# Library includes
################################################################################
ifeq ($(CONFIG_LIBINTEL_INTRINSICS),y)
-CINCLUDES-$(CONFIG_LIBINTEL_INTRINSICS) += -I$(LIBINTEL_INTRINSICS_BASE)/include
-CXXINCLUDES-$(CONFIG_LIBINTEL_INTRINSICS) += -I$(LIBINTEL_INTRINSICS_BASE)/include
+CINCLUDES-$(call have_clang) += -I$(LIBINTEL_INTRINSICS_BASE)/include-llvm
+CXXINCLUDES-$(call have_clang) += -I$(LIBINTEL_INTRINSICS_BASE)/include-llvm
endif
--- /dev/null
+/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __WMMINTRIN_H
+#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
+#endif
+
+#ifndef __WMMINTRIN_AES_H
+#define __WMMINTRIN_AES_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))
+
+/// Performs a single round of AES encryption, transforming the state
+///    value from the first source
+/// operand using a 128-bit round key value contained in the second source
+/// operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
+///
+/// \param __V
+/// A 128-bit integer vector containing the state value.
+/// \param __R
+/// A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenc_si128(__m128i __V, __m128i __R)
+{
+ return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
+}
+
+/// Performs the final round of AES encryption, transforming the state
+///    value from the first source
+/// operand using a 128-bit round key value contained in the second source
+/// operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
+///
+/// \param __V
+/// A 128-bit integer vector containing the state value.
+/// \param __R
+/// A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenclast_si128(__m128i __V, __m128i __R)
+{
+ return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
+}
+
+/// Performs a single round of AES decryption using the Equivalent
+/// Inverse Cipher, transforming the state value from the first source
+/// operand using a 128-bit round key value contained in the second source
+/// operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
+///
+/// \param __V
+/// A 128-bit integer vector containing the state value.
+/// \param __R
+/// A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesdec_si128(__m128i __V, __m128i __R)
+{
+ return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
+}
+
+/// Performs the final round of AES decryption using the Equivalent
+/// Inverse Cipher, transforming the state value from the first source
+/// operand using a 128-bit round key value contained in the second source
+/// operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
+///
+/// \param __V
+/// A 128-bit integer vector containing the state value.
+/// \param __R
+/// A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the decrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesdeclast_si128(__m128i __V, __m128i __R)
+{
+ return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
+}
+
+/// Applies the AES InvMixColumns() transformation to an expanded key
+/// contained in the source operand, and writes the result to the
+/// destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
+///
+/// \param __V
+/// A 128-bit integer vector containing the expanded key.
+/// \returns A 128-bit integer vector containing the transformed value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesimc_si128(__m128i __V)
+{
+ return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
+}
+
+/// Generates a round key for AES encryption, operating on 128-bit data
+/// specified in the first source operand and using an 8-bit round constant
+/// specified by the second source operand, and writes the result to the
+/// destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
+///
+/// \param C
+/// A 128-bit integer vector that is used to generate the AES encryption key.
+/// \param R
+/// An 8-bit round constant used to generate the AES encryption key.
+/// \returns A 128-bit round key for AES encryption.
+#define _mm_aeskeygenassist_si128(C, R) \
+ ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))
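+
+/* Illustrative usage sketch: one full AES round and the final round (which
+ * omits MixColumns) applied to a 128-bit state. The names state/round_key
+ * are placeholders, not part of the API.
+ *
+ *   #include <wmmintrin.h>
+ *
+ *   __m128i aes_round(__m128i state, __m128i round_key)
+ *   {
+ *     return _mm_aesenc_si128(state, round_key);
+ *   }
+ *
+ *   __m128i aes_final_round(__m128i state, __m128i round_key)
+ *   {
+ *     return _mm_aesenclast_si128(state, round_key);
+ *   }
+ */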
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __WMMINTRIN_AES_H */
--- /dev/null
+/*===---- __wmmintrin_pclmul.h - PCLMUL intrinsics --------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __WMMINTRIN_H
+#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
+#endif
+
+#ifndef __WMMINTRIN_PCLMUL_H
+#define __WMMINTRIN_PCLMUL_H
+
+/// Multiplies two 64-bit integer values, which are selected from source
+/// operands using the immediate-value operand. The multiplication is a
+/// carry-less multiplication, and the 128-bit integer product is stored in
+/// the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
+///
+/// \param __X
+/// A 128-bit vector of [2 x i64] containing one of the source operands.
+/// \param __Y
+/// A 128-bit vector of [2 x i64] containing one of the source operands.
+/// \param __I
+/// An immediate value specifying which 64-bit values to select from the
+/// operands. Bit 0 is used to select a value from operand \a __X, and bit
+/// 4 is used to select a value from operand \a __Y: \n
+/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
+/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
+/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
+/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
+/// \returns The 128-bit integer vector containing the result of the carry-less
+/// multiplication of the selected 64-bit values.
+#define _mm_clmulepi64_si128(X, Y, I) \
+ ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), (char)(I)))
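+
+/* Illustrative usage sketch: the immediate picks which 64-bit halves are
+ * multiplied (bit 0 selects the half of X, bit 4 the half of Y). The values
+ * a and b are placeholder __m128i operands.
+ *
+ *   __m128i lo = _mm_clmulepi64_si128(a, b, 0x00);   // a[63:0]   * b[63:0]
+ *   __m128i hi = _mm_clmulepi64_si128(a, b, 0x11);   // a[127:64] * b[127:64]
+ */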
+
+#endif /* __WMMINTRIN_PCLMUL_H */
--- /dev/null
+/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __ADXINTRIN_H
+#define __ADXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+/* Intrinsics that are available only if __ADX__ defined */
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+ unsigned int *__p)
+{
+ return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
+_addcarryx_u64(unsigned char __cf, unsigned long long __x,
+ unsigned long long __y, unsigned long long *__p)
+{
+ return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
+}
+#endif
+
+/* Intrinsics that are also available if __ADX__ undefined */
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+ unsigned int *__p)
+{
+ return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_addcarry_u64(unsigned char __cf, unsigned long long __x,
+ unsigned long long __y, unsigned long long *__p)
+{
+ return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
+}
+#endif
+
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
+ unsigned int *__p)
+{
+ return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
+}
+
+#ifdef __x86_64__
+static __inline unsigned char __DEFAULT_FN_ATTRS
+_subborrow_u64(unsigned char __cf, unsigned long long __x,
+ unsigned long long __y, unsigned long long *__p)
+{
+ return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
+}
+#endif
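+
+/* Illustrative usage sketch: a 128-bit addition built from a carry chain on
+ * 64-bit targets. The carry-out of the low limb feeds the carry-in of the
+ * high limb; a_lo/a_hi/b_lo/b_hi are placeholder operands.
+ *
+ *   unsigned long long lo, hi;
+ *   unsigned char c = _addcarry_u64(0, a_lo, b_lo, &lo);
+ *   (void)_addcarry_u64(c, a_hi, b_hi, &hi);
+ */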
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __ADXINTRIN_H */
--- /dev/null
+/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __AMMINTRIN_H
+#define __AMMINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))
+
+/// Extracts the specified bits from the lower 64 bits of the 128-bit
+/// integer vector operand at the index \a idx and of the length \a len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
+///
+/// \param x
+/// The value from which bits are extracted.
+/// \param len
+/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+/// are zero, the length is interpreted as 64.
+/// \param idx
+/// Bits [5:0] specify the index of the least significant bit; the other
+/// bits are ignored. If the sum of the index and length is greater than 64,
+/// the result is undefined. If the length and index are both zero, bits
+/// [63:0] of parameter \a x are extracted. If the length is zero but the
+/// index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
+/// extracted from the source operand.
+#define _mm_extracti_si64(x, len, idx) \
+ ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
+ (char)(len), (char)(idx)))
+
+/// Extracts the specified bits from the lower 64 bits of the 128-bit
+/// integer vector operand at the index and of the length specified by
+/// \a __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
+///
+/// \param __x
+/// The value from which bits are extracted.
+/// \param __y
+/// Specifies the index of the least significant bit at [13:8] and the
+/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
+/// length is interpreted as 64. If the sum of the index and length is
+/// greater than 64, the result is undefined. If the length and index are
+/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
+/// is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
+/// from the source operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_extract_si64(__m128i __x, __m128i __y)
+{
+ return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
+}
+
+/// Inserts bits of a specified length from the source integer vector
+/// \a y into the lower 64 bits of the destination integer vector \a x at
+/// the index \a idx and of the length \a len.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
+/// const int idx);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
+///
+/// \param x
+/// The destination operand where bits will be inserted. The inserted bits
+/// are defined by the length \a len and by the index \a idx specifying the
+/// least significant bit.
+/// \param y
+/// The source operand containing the bits to be extracted. The extracted
+/// bits are the least significant bits of operand \a y of length \a len.
+/// \param len
+/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
+/// are zero, the length is interpreted as 64.
+/// \param idx
+/// Bits [5:0] specify the index of the least significant bit; the other
+/// bits are ignored. If the sum of the index and length is greater than 64,
+/// the result is undefined. If the length and index are both zero, bits
+/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
+/// is zero but the index is non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits of
+/// destination operand \a x with the specified bitfields replaced by the
+/// lower bits of source operand \a y. The upper 64 bits of the return value
+/// are undefined.
+#define _mm_inserti_si64(x, y, len, idx) \
+ ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
+ (__v2di)(__m128i)(y), \
+ (char)(len), (char)(idx)))
+
+/// Inserts bits of a specified length from the source integer vector
+/// \a __y into the lower 64 bits of the destination integer vector \a __x
+/// at the index and of the length specified by \a __y.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
+///
+/// \param __x
+/// The destination operand where bits will be inserted. The inserted bits
+/// are defined by the length and by the index of the least significant bit
+/// specified by operand \a __y.
+/// \param __y
+/// The source operand containing the bits to be extracted. The extracted
+/// bits are the least significant bits of operand \a __y with length
+/// specified by bits [69:64]. These are inserted into the destination at the
+/// index specified by bits [77:72]; all other bits are ignored. If bits
+/// [69:64] are zero, the length is interpreted as 64. If the sum of the
+/// index and length is greater than 64, the result is undefined. If the
+/// length and index are both zero, bits [63:0] of parameter \a __y are
+/// inserted into parameter \a __x. If the length is zero but the index is
+/// non-zero, the result is undefined.
+/// \returns A 128-bit integer vector containing the original lower 64-bits of
+/// destination operand \a __x with the specified bitfields replaced by the
+/// lower bits of source operand \a __y. The upper 64 bits of the return
+/// value are undefined.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_insert_si64(__m128i __x, __m128i __y)
+{
+ return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
+}
+
+/// Stores a 64-bit double-precision value in a 64-bit memory location.
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
+///
+/// \param __p
+/// The 64-bit memory location used to store the register value.
+/// \param __a
+/// The 64-bit double-precision floating-point register value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_sd(double *__p, __m128d __a)
+{
+ __builtin_ia32_movntsd(__p, (__v2df)__a);
+}
+
+/// Stores a 32-bit single-precision floating-point value in a 32-bit
+/// memory location. To minimize caching, the data is flagged as
+/// non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
+///
+/// \param __p
+/// The 32-bit memory location used to store the register value.
+/// \param __a
+/// The 32-bit single-precision floating-point register value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_ss(float *__p, __m128 __a)
+{
+ __builtin_ia32_movntss(__p, (__v4sf)__a);
+}
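+
+/* Illustrative usage sketch: extracting an 8-bit field that starts at bit 16
+ * of the low 64 bits, then writing it back at the same position. src and dst
+ * are placeholder __m128i values.
+ *
+ *   __m128i field  = _mm_extracti_si64(src, 8, 16);        // bits [23:16] -> bits [7:0]
+ *   __m128i merged = _mm_inserti_si64(dst, field, 8, 16);  // insert them back at bit 16
+ */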
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AMMINTRIN_H */
--- /dev/null
+/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMXINTRIN_H
+#define __AMXINTRIN_H
+#ifdef __x86_64__
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS_TILE \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
+#define __DEFAULT_FN_ATTRS_INT8 \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
+#define __DEFAULT_FN_ATTRS_BF16 \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
+
+/// Load tile configuration from a 64-byte memory location specified by
+/// "__config". The tile configuration includes the tile type palette, the
+/// number of bytes per row, and the number of rows. If the specified
+/// palette_id is zero, that signifies the init state for both the tile
+/// config and the tile data, and the tiles are zeroed. Any invalid
+/// configuration will result in a #GP fault.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
+///
+/// \param __config
+///    A pointer to the 512-bit configuration
+static __inline__ void __DEFAULT_FN_ATTRS_TILE
+_tile_loadconfig(const void *__config) {
+ __builtin_ia32_tile_loadconfig(__config);
+}
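+
+/* Illustrative usage sketch: a 64-byte configuration block following Intel's
+ * documented layout (palette byte, start_row byte, reserved bytes, 16 x 2-byte
+ * colsb entries, 16 x 1-byte rows entries). The struct name and field names
+ * are placeholders chosen for the example.
+ *
+ *   struct __attribute__((packed)) tile_config {
+ *     unsigned char  palette_id;
+ *     unsigned char  start_row;
+ *     unsigned char  reserved[14];
+ *     unsigned short colsb[16];
+ *     unsigned char  rows[16];
+ *   };
+ *
+ *   struct tile_config cfg = {0};
+ *   cfg.palette_id = 1;
+ *   cfg.rows[0]  = 16;
+ *   cfg.colsb[0] = 64;              // tile 0: 16 rows x 64 bytes per row
+ *   _tile_loadconfig(&cfg);
+ */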
+
+/// Stores the current tile configuration to a 64-byte memory location
+/// specified by "mem_addr". The tile configuration includes the tile type
+/// palette, the number of bytes per row, and the number of rows. If tiles
+/// are not configured, all zeroes will be stored to memory.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
+///
+/// \param __config
+///    A pointer to the 512-bit configuration
+static __inline__ void __DEFAULT_FN_ATTRS_TILE
+_tile_storeconfig(void *__config) {
+ __builtin_ia32_tile_storeconfig(__config);
+}
+
+/// Release the tile configuration to return to the init state, which
+/// releases all storage it currently holds.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
+static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
+ __builtin_ia32_tilerelease();
+}
+
+/// Load tile rows from memory specified by "base" address and "stride" into
+/// destination tile "dst" using the tile configuration previously configured
+/// via "_tile_loadconfig".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
+///
+/// \param dst
+/// A destination tile. Max size is 1024 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+#define _tile_loadd(dst, base, stride) \
+ __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
+ (__SIZE_TYPE__)(stride))
+
+/// Load tile rows from memory specified by "base" address and "stride" into
+/// destination tile "dst" using the tile configuration previously configured
+/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
+/// that the data will likely not be reused in the near future and the data
+/// caching can be optimized accordingly.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
+///
+/// \param dst
+/// A destination tile. Max size is 1024 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+#define _tile_stream_loadd(dst, base, stride) \
+ __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
+ (__SIZE_TYPE__)(stride))
+
+/// Store the tile specified by "src" to memory specifieid by "base" address and
+/// "stride" using the tile configuration previously configured via
+/// "_tile_loadconfig".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
+///
+/// \param dst
+/// A destination tile. Max size is 1024 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be stored in memory.
+#define _tile_stored(dst, base, stride) \
+ __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
+
+/// Zero the tile specified by "tdest".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
+///
+/// \param tile
+///    The destination tile to be zeroed. Max size is 1024 Bytes.
+#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
+/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
+/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
+/// and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbssd(dst, src0, src1) \
+ __builtin_ia32_tdpbssd((dst), (src0), (src1))
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
+/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
+/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in "dst", and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbsud(dst, src0, src1) \
+ __builtin_ia32_tdpbsud((dst), (src0), (src1))
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
+/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
+/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
+/// and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbusd(dst, src0, src1) \
+ __builtin_ia32_tdpbusd((dst), (src0), (src1))
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
+/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
+/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
+/// "dst", and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbuud(dst, src0, src1) \
+ __builtin_ia32_tdpbuud((dst), (src0), (src1))
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
+/// src1, accumulating the intermediate single-precision (32-bit) floating-point
+/// elements with elements in "dst", and store the 32-bit result back to tile
+/// "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpbf16ps(dst, src0, src1) \
+ __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
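+
+/* Illustrative usage sketch: a minimal int8 dot-product sequence using the
+ * direct tile macros. Tile numbers must be compile-time constants, and a
+ * valid configuration is assumed to have been loaded with _tile_loadconfig()
+ * first; the buffer and stride names are placeholders.
+ *
+ *   _tile_zero(0);                      // tile 0 accumulates int32 results
+ *   _tile_loadd(1, a_buf, a_stride);    // tile 1: signed int8 A panel
+ *   _tile_loadd(2, b_buf, b_stride);    // tile 2: signed int8 B panel
+ *   _tile_dpbssd(0, 1, 2);              // tile0 += dot-product of tiles 1 and 2
+ *   _tile_stored(0, c_buf, c_stride);
+ *   _tile_release();
+ */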
+
+/// The AMX tile register size can be configured; the maximum size is
+/// 16x64=1024 bytes. Since there is no 2D type in LLVM IR, we use a vector
+/// type to represent the 2D tile, with a fixed size equal to the maximum
+/// AMX tile register size.
+typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
+ __SIZE_TYPE__ stride) {
+ return __builtin_ia32_tileloadd64_internal(m, n, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
+ __SIZE_TYPE__ stride) {
+ return __builtin_ia32_tileloaddt164_internal(m, n, base,
+ (__SIZE_TYPE__)(stride));
+}
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
+}
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
+}
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
+}
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
+_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
+}
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ void __DEFAULT_FN_ATTRS_INT8
+_tile_stored_internal(unsigned short m, unsigned short n, void *base,
+ __SIZE_TYPE__ stride, _tile1024i tile) {
+ return __builtin_ia32_tilestored64_internal(m, n, base,
+ (__SIZE_TYPE__)(stride), tile);
+}
+
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
+_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
+}
+
+/// This struct packs the shape and tile data together for the user. We
+/// suggest initializing the struct as early as possible, because the
+/// compiler depends on the shape information to do the configuration.
+/// Constant values are preferred for compiler optimization.
+typedef struct __tile1024i_str {
+ const unsigned short row;
+ const unsigned short col;
+ _tile1024i tile;
+} __tile1024i;
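+
+/* Illustrative usage sketch: the shape is given as constants at the point of
+ * definition so the compiler can derive the tile configuration, e.g.
+ *
+ *   __tile1024i a = {16, 64};   // 16 rows, 64 bytes per row
+ */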
+
+/// Load tile rows from memory specified by "base" address and "stride" into
+/// destination tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
+///
+/// \param dst
+/// A destination tile. Max size is 1024 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TILE
+static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
+ __SIZE_TYPE__ stride) {
+ dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
+}
+
+/// Load tile rows from memory specified by "base" address and "stride" into
+/// destination tile "dst". This intrinsic provides a hint to the implementation
+/// that the data will likely not be reused in the near future and the data
+/// caching can be optimized accordingly.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
+///
+/// \param dst
+/// A destination tile. Max size is 1024 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be loaded in memory.
+__DEFAULT_FN_ATTRS_TILE
+static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
+ __SIZE_TYPE__ stride) {
+ dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
+}
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
+/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
+/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
+/// and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_INT8
+static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
+ src0.tile, src1.tile);
+}
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
+/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
+/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in "dst", and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_INT8
+static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
+ src0.tile, src1.tile);
+}
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
+/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
+/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
+/// and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_INT8
+static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
+ src0.tile, src1.tile);
+}
+
+/// Compute dot-product of bytes in tiles with a source/destination accumulator.
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
+/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
+/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
+/// "dst", and store the 32-bit result back to tile "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_INT8
+static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
+ src0.tile, src1.tile);
+}
+
+/// Store the tile specified by "src" to memory specifieid by "base" address and
+/// "stride".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
+///
+/// \param src
+///    The source tile. Max size is 1024 Bytes.
+/// \param base
+/// A pointer to base address.
+/// \param stride
+/// The stride between the rows' data to be stored in memory.
+__DEFAULT_FN_ATTRS_TILE
+static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
+ __tile1024i src) {
+ _tile_stored_internal(src.row, src.col, base, stride, src.tile);
+}
+
+/// Zero the tile specified by "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
+///
+/// \param dst
+///    The destination tile to be zeroed. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_TILE
+static __inline__ void __tile_zero(__tile1024i *dst) {
+ dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
+}
+
+/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
+/// src1, accumulating the intermediate single-precision (32-bit) floating-point
+/// elements with elements in "dst", and store the 32-bit result back to tile
+/// "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_BF16
+static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+ src0.tile, src1.tile);
+}
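+
+/* Illustrative usage sketch: one int8 matrix-multiply tile step using the
+ * struct-based helpers. Buffer and stride names are placeholders; the shapes
+ * are constants so the compiler can configure the tiles.
+ *
+ *   __tile1024i a = {16, 64};   // 16 x 64 bytes of int8
+ *   __tile1024i b = {16, 64};
+ *   __tile1024i c = {16, 64};   // 16 x 16 int32 accumulators
+ *
+ *   __tile_loadd(&a, a_buf, a_stride);
+ *   __tile_loadd(&b, b_buf, b_stride);
+ *   __tile_zero(&c);
+ *   __tile_dpbssd(&c, a, b);
+ *   __tile_stored(c_buf, c_stride, c);
+ */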
+
+#undef __DEFAULT_FN_ATTRS_TILE
+#undef __DEFAULT_FN_ATTRS_INT8
+#undef __DEFAULT_FN_ATTRS_BF16
+
+#endif /* __x86_64__ */
+#endif /* __AMXINTRIN_H */
--- /dev/null
+/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX2INTRIN_H
+#define __AVX2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128)))
+
+/* SSE4 Multiple Packed Sums of Absolute Difference. */
+#define _mm256_mpsadbw_epu8(X, Y, M) \
+ ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
+ (__v32qi)(__m256i)(Y), (int)(M)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_abs_epi8(__m256i __a)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
+#else
+ return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_abs_epi16(__m256i __a)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
+#else
+ return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_abs_epi32(__m256i __a)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
+#else
+ return (__m256i)__builtin_elementwise_abs((__v8si)__a);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_packs_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_packs_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_packus_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_packus_epi32(__m256i __V1, __m256i __V2)
+{
+ return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_add_epi8(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v32qu)__a + (__v32qu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_add_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v16hu)__a + (__v16hu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_add_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v8su)__a + (__v8su)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_add_epi64(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v4du)__a + (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_adds_epi8(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
+#else
+ return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_adds_epi16(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
+#else
+ return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_adds_epu8(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
+#else
+ return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_adds_epu16(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
+#else
+ return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
+#endif
+}
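+
+/* Illustrative usage sketch: _mm256_add_epi8 wraps on overflow while the
+ * saturating forms clamp to the element type's range; for example, per
+ * unsigned byte:
+ *
+ *   __m256i a = _mm256_set1_epi8((char)200);
+ *   __m256i b = _mm256_set1_epi8((char)100);
+ *   __m256i wrap = _mm256_add_epi8(a, b);    // 200 + 100 -> 44  (mod 256)
+ *   __m256i sat  = _mm256_adds_epu8(a, b);   // 200 + 100 -> 255 (clamped)
+ */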
+
+#define _mm256_alignr_epi8(a, b, n) \
+ ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
+ (__v32qi)(__m256i)(b), (n)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_and_si256(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v4du)__a & (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_andnot_si256(__m256i __a, __m256i __b)
+{
+ return (__m256i)(~(__v4du)__a & (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_avg_epu8(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_avg_epu16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
+{
+ return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
+ (__v32qi)__M);
+}
+
+#define _mm256_blend_epi16(V1, V2, M) \
+ ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
+ (__v16hi)(__m256i)(V2), (int)(M)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v32qi)__a == (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v16hi)__a == (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v8si)__a == (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v4di)__a == (__v4di)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
+{
+ /* This function always performs a signed comparison, but __v32qi is a char
+ which may be signed or unsigned, so use __v32qs. */
+ return (__m256i)((__v32qs)__a > (__v32qs)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v16hi)__a > (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v8si)__a > (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v4di)__a > (__v4di)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_hadd_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_hadd_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_hadds_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_hsub_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_hsub_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_hsubs_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maddubs_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_madd_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epi8(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
+#else
+ return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epi16(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
+#else
+ return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epi32(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
+#else
+ return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epu8(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
+#else
+ return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epu16(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
+#else
+ return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epu32(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
+#else
+ return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epi8(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
+#else
+ return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epi16(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
+#else
+ return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epi32(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
+#else
+ return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epu8(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
+#else
+ return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epu16(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
+#else
+ return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epu32(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
+#else
+ return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
+#endif
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS256
+_mm256_movemask_epi8(__m256i __a)
+{
+ return __builtin_ia32_pmovmskb256((__v32qi)__a);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtepi8_epi16(__m128i __V)
+{
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtepi8_epi32(__m128i __V)
+{
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtepi8_epi64(__m128i __V)
+{
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtepi16_epi32(__m128i __V)
+{
+ return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtepi16_epi64(__m128i __V)
+{
+ return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtepi32_epi64(__m128i __V)
+{
+ return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtepu8_epi16(__m128i __V)
+{
+ return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtepu8_epi32(__m128i __V)
+{
+ return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtepu8_epi64(__m128i __V)
+{
+ return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtepu16_epi32(__m128i __V)
+{
+ return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtepu16_epi64(__m128i __V)
+{
+ return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtepu32_epi64(__m128i __V)
+{
+ return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
+}
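+
+/* Illustrative usage sketch: the epi8/epi16/epi32 variants sign-extend while
+ * the epu8/epu16/epu32 variants zero-extend; for a source byte of 0xFF:
+ *
+ *   __m128i v = _mm_set1_epi8((char)0xFF);
+ *   __m256i s = _mm256_cvtepi8_epi16(v);   // each element becomes -1  (0xFFFF)
+ *   __m256i u = _mm256_cvtepu8_epi16(v);   // each element becomes 255 (0x00FF)
+ */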
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mul_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mulhi_epu16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mulhi_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mullo_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v16hu)__a * (__v16hu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mullo_epi32 (__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v8su)__a * (__v8su)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mul_epu32(__m256i __a, __m256i __b)
+{
+ return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_or_si256(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v4du)__a | (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sad_epu8(__m256i __a, __m256i __b)
+{
+ return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shuffle_epi8(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
+}
+
+#define _mm256_shuffle_epi32(a, imm) \
+ ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
+
+#define _mm256_shufflehi_epi16(a, imm) \
+ ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
+
+#define _mm256_shufflelo_epi16(a, imm) \
+ ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sign_epi8(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sign_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sign_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_slli_si256(a, imm) \
+ ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
+
+#define _mm256_bslli_epi128(a, imm) \
+ ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_slli_epi16(__m256i __a, int __count)
+{
+ return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sll_epi16(__m256i __a, __m128i __count)
+{
+ return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_slli_epi32(__m256i __a, int __count)
+{
+ return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sll_epi32(__m256i __a, __m128i __count)
+{
+ return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_slli_epi64(__m256i __a, int __count)
+{
+ return __builtin_ia32_psllqi256((__v4di)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sll_epi64(__m256i __a, __m128i __count)
+{
+ return __builtin_ia32_psllq256((__v4di)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srai_epi16(__m256i __a, int __count)
+{
+ return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sra_epi16(__m256i __a, __m128i __count)
+{
+ return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srai_epi32(__m256i __a, int __count)
+{
+ return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sra_epi32(__m256i __a, __m128i __count)
+{
+ return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
+}
+
+#define _mm256_srli_si256(a, imm) \
+ ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
+
+#define _mm256_bsrli_epi128(a, imm) \
+ ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srli_epi16(__m256i __a, int __count)
+{
+ return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srl_epi16(__m256i __a, __m128i __count)
+{
+ return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srli_epi32(__m256i __a, int __count)
+{
+ return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srl_epi32(__m256i __a, __m128i __count)
+{
+ return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srli_epi64(__m256i __a, int __count)
+{
+ return __builtin_ia32_psrlqi256((__v4di)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srl_epi64(__m256i __a, __m128i __count)
+{
+ return __builtin_ia32_psrlq256((__v4di)__a, __count);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sub_epi8(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v32qu)__a - (__v32qu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sub_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v16hu)__a - (__v16hu)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sub_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v8su)__a - (__v8su)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sub_epi64(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v4du)__a - (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_subs_epi8(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
+#else
+ return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_subs_epi16(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
+#else
+ return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_subs_epu8(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
+#else
+ return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_subs_epu16(__m256i __a, __m256i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
+#else
+ return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_xor_si256(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v4du)__a ^ (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_stream_load_si256(__m256i const *__V)
+{
+ typedef __v4di __v4di_aligned __attribute__((aligned(32)));
+ return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_broadcastss_ps(__m128 __X)
+{
+ return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_broadcastsd_pd(__m128d __a)
+{
+ return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_broadcastss_ps(__m128 __X)
+{
+ return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_broadcastsd_pd(__m128d __X)
+{
+ return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_broadcastsi128_si256(__m128i __X)
+{
+ return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
+}
+
+#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
+
+#define _mm_blend_epi32(V1, V2, M) \
+ ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
+ (__v4si)(__m128i)(V2), (int)(M)))
+
+#define _mm256_blend_epi32(V1, V2, M) \
+ ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
+ (__v8si)(__m256i)(V2), (int)(M)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_broadcastb_epi8(__m128i __X)
+{
+ return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_broadcastw_epi16(__m128i __X)
+{
+ return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_broadcastd_epi32(__m128i __X)
+{
+ return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_broadcastq_epi64(__m128i __X)
+{
+ return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_broadcastb_epi8(__m128i __X)
+{
+ return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_broadcastw_epi16(__m128i __X)
+{
+ return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_broadcastd_epi32(__m128i __X)
+{
+ return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_broadcastq_epi64(__m128i __X)
+{
+ return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_pd(V, M) \
+ ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
+{
+ return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
+}
+
+#define _mm256_permute4x64_epi64(V, M) \
+ ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
+
+#define _mm256_permute2x128_si256(V1, V2, M) \
+ ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
+
+#define _mm256_extracti128_si256(V, M) \
+ ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
+
+#define _mm256_inserti128_si256(V1, V2, M) \
+ ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
+ (__v2di)(__m128i)(V2), (int)(M)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskload_epi32(int const *__X, __m256i __M)
+{
+ return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskload_epi64(long long const *__X, __m256i __M)
+{
+ return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskload_epi32(int const *__X, __m128i __M)
+{
+ return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskload_epi64(long long const *__X, __m128i __M)
+{
+ return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
+{
+ __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
+{
+ __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
+{
+ __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
+{
+ __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sllv_epi32(__m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_sllv_epi32(__m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sllv_epi64(__m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_sllv_epi64(__m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srav_epi32(__m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_srav_epi32(__m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srlv_epi32(__m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_srlv_epi32(__m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srlv_epi64(__m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_srlv_epi64(__m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
+}
+
+#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
+ ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
+ (double const *)(m), \
+ (__v4si)(__m128i)(i), \
+ (__v2df)(__m128d)(mask), (s)))
+
+#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
+ ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
+ (double const *)(m), \
+ (__v4si)(__m128i)(i), \
+ (__v4df)(__m256d)(mask), (s)))
+
+#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
+ ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
+ (double const *)(m), \
+ (__v2di)(__m128i)(i), \
+ (__v2df)(__m128d)(mask), (s)))
+
+#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
+ ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
+ (double const *)(m), \
+ (__v4di)(__m256i)(i), \
+ (__v4df)(__m256d)(mask), (s)))
+
+#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
+ ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
+ (float const *)(m), \
+ (__v4si)(__m128i)(i), \
+ (__v4sf)(__m128)(mask), (s)))
+
+#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
+ ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
+ (float const *)(m), \
+ (__v8si)(__m256i)(i), \
+ (__v8sf)(__m256)(mask), (s)))
+
+#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
+ ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
+ (float const *)(m), \
+ (__v2di)(__m128i)(i), \
+ (__v4sf)(__m128)(mask), (s)))
+
+#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
+ ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
+ (float const *)(m), \
+ (__v4di)(__m256i)(i), \
+ (__v4sf)(__m128)(mask), (s)))
+
+#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
+ ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
+ (int const *)(m), \
+ (__v4si)(__m128i)(i), \
+ (__v4si)(__m128i)(mask), (s)))
+
+#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
+ ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
+ (int const *)(m), \
+ (__v8si)(__m256i)(i), \
+ (__v8si)(__m256i)(mask), (s)))
+
+#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
+ ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
+ (int const *)(m), \
+ (__v2di)(__m128i)(i), \
+ (__v4si)(__m128i)(mask), (s)))
+
+#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
+ ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
+ (int const *)(m), \
+ (__v4di)(__m256i)(i), \
+ (__v4si)(__m128i)(mask), (s)))
+
+#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
+ ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
+ (long long const *)(m), \
+ (__v4si)(__m128i)(i), \
+ (__v2di)(__m128i)(mask), (s)))
+
+#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
+ ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
+ (long long const *)(m), \
+ (__v4si)(__m128i)(i), \
+ (__v4di)(__m256i)(mask), (s)))
+
+#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
+ ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
+ (long long const *)(m), \
+ (__v2di)(__m128i)(i), \
+ (__v2di)(__m128i)(mask), (s)))
+
+#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
+ ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
+ (long long const *)(m), \
+ (__v4di)(__m256i)(i), \
+ (__v4di)(__m256i)(mask), (s)))
+
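+/* The unmasked gather macros below select every element by synthesizing an
+ * all-ones mask: the float/double forms compare zero with itself (always
+ * true), the integer forms use _mm_set1_epi32(-1) / _mm_set1_epi64x(-1) or
+ * their 256-bit equivalents, and the pass-through source is left undefined.
+ */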
+#define _mm_i32gather_pd(m, i, s) \
+ ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
+ (double const *)(m), \
+ (__v4si)(__m128i)(i), \
+ (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+ _mm_setzero_pd()), \
+ (s)))
+
+#define _mm256_i32gather_pd(m, i, s) \
+ ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
+ (double const *)(m), \
+ (__v4si)(__m128i)(i), \
+ (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+ _mm256_setzero_pd(), \
+ _CMP_EQ_OQ), \
+ (s)))
+
+#define _mm_i64gather_pd(m, i, s) \
+ ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
+ (double const *)(m), \
+ (__v2di)(__m128i)(i), \
+ (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+ _mm_setzero_pd()), \
+ (s)))
+
+#define _mm256_i64gather_pd(m, i, s) \
+ ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
+ (double const *)(m), \
+ (__v4di)(__m256i)(i), \
+ (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+ _mm256_setzero_pd(), \
+ _CMP_EQ_OQ), \
+ (s)))
+
+#define _mm_i32gather_ps(m, i, s) \
+ ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
+ (float const *)(m), \
+ (__v4si)(__m128i)(i), \
+ (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+ _mm_setzero_ps()), \
+ (s)))
+
+#define _mm256_i32gather_ps(m, i, s) \
+ ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
+ (float const *)(m), \
+ (__v8si)(__m256i)(i), \
+ (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
+ _mm256_setzero_ps(), \
+ _CMP_EQ_OQ), \
+ (s)))
+
+#define _mm_i64gather_ps(m, i, s) \
+ ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
+ (float const *)(m), \
+ (__v2di)(__m128i)(i), \
+ (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+ _mm_setzero_ps()), \
+ (s)))
+
+#define _mm256_i64gather_ps(m, i, s) \
+ ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
+ (float const *)(m), \
+ (__v4di)(__m256i)(i), \
+ (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+ _mm_setzero_ps()), \
+ (s)))
+
+#define _mm_i32gather_epi32(m, i, s) \
+ ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
+ (int const *)(m), (__v4si)(__m128i)(i), \
+ (__v4si)_mm_set1_epi32(-1), (s)))
+
+#define _mm256_i32gather_epi32(m, i, s) \
+ ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
+ (int const *)(m), (__v8si)(__m256i)(i), \
+ (__v8si)_mm256_set1_epi32(-1), (s)))
+
+#define _mm_i64gather_epi32(m, i, s) \
+ ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
+ (int const *)(m), (__v2di)(__m128i)(i), \
+ (__v4si)_mm_set1_epi32(-1), (s)))
+
+#define _mm256_i64gather_epi32(m, i, s) \
+ ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
+ (int const *)(m), (__v4di)(__m256i)(i), \
+ (__v4si)_mm_set1_epi32(-1), (s)))
+
+#define _mm_i32gather_epi64(m, i, s) \
+ ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
+ (long long const *)(m), \
+ (__v4si)(__m128i)(i), \
+ (__v2di)_mm_set1_epi64x(-1), (s)))
+
+#define _mm256_i32gather_epi64(m, i, s) \
+ ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
+ (long long const *)(m), \
+ (__v4si)(__m128i)(i), \
+ (__v4di)_mm256_set1_epi64x(-1), (s)))
+
+#define _mm_i64gather_epi64(m, i, s) \
+ ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
+ (long long const *)(m), \
+ (__v2di)(__m128i)(i), \
+ (__v2di)_mm_set1_epi64x(-1), (s)))
+
+#define _mm256_i64gather_epi64(m, i, s) \
+ ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
+ (long long const *)(m), \
+ (__v4di)(__m256i)(i), \
+ (__v4di)_mm256_set1_epi64x(-1), (s)))
+
+#undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS128
+
+#endif /* __AVX2INTRIN_H */
--- /dev/null
+/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512BF16INTRIN_H
+#define __AVX512BF16INTRIN_H
+
+#if (__clang_major__ > 15)
+typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
+typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
+typedef __bf16 __bfloat16;
+#else
+typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
+typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
+typedef unsigned short __bfloat16;
+#endif
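+/* Note: clang 16 and newer model these vectors with real __bf16 elements,
+ * while older releases fall back to 16-bit integer elements; either way the
+ * storage layout is the same, which is why the intrinsics below can bitcast
+ * freely through integer vector types such as __v32hi.
+ */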
+
+#define __DEFAULT_FN_ATTRS512 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
+ __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
+
+/// Convert one BF16 value to one single-precision float value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic does not correspond to a specific instruction.
+///
+/// \param __A
+/// A bfloat16 value.
+/// \returns A float value whose sign and exponent fields are unchanged and
+/// whose fraction field is extended to 23 bits.
+static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) {
+ return __builtin_ia32_cvtsbf162ss_32(__A);
+}
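+/* Usage sketch (illustrative only; the constant assumes the integer BF16
+ * representation used for clang 15 and earlier, it is not defined by this
+ * header):
+ *
+ *   __bfloat16 one_bf16 = 0x3f80;       // upper 16 bits of 1.0f
+ *   float f = _mm_cvtsbh_ss(one_bf16);  // f == 1.0f
+ */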
+
+/// Convert Two Packed Single Data to One Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 512-bit vector of [16 x float].
+/// \param __B
+/// A 512-bit vector of [16 x float].
+/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
+/// conversion of __B, and higher 256 bits come from conversion of __A.
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
+ (__v16sf) __B);
+}
+
+/// Convert Two Packed Single Data to One Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 512-bit vector of [16 x float].
+/// \param __B
+/// A 512-bit vector of [16 x float].
+/// \param __W
+/// A 512-bit vector of [32 x bfloat].
+/// \param __U
+/// A 32-bit mask value specifying what is chosen for each element.
+/// A 1 means conversion of __A or __B. A 0 means element from __W.
+/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
+/// conversion of __B, and higher 256 bits come from conversion of __A.
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
+ return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
+ (__v32hi)__W);
+}
+
+/// Convert Two Packed Single Data to One Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 512-bit vector of [16 x float].
+/// \param __B
+/// A 512-bit vector of [16 x float].
+/// \param __U
+/// A 32-bit mask value specifying what is chosen for each element.
+/// A 1 means conversion of __A or __B. A 0 means element is zero.
+/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
+/// conversion of __B, and higher 256 bits come from conversion of __A.
+static __inline__ __m512bh __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
+ return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
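+/* Masking sketch for the _cvtne2ps family (hedged example; variable names are
+ * placeholders, not part of the API). Bit i of the mask selects between the
+ * converted element and either the pass-through vector or zero:
+ *
+ *   __m512 a = _mm512_set1_ps(1.0f), b = _mm512_set1_ps(2.0f);
+ *   __m512bh w = _mm512_cvtne2ps_pbh(a, a);
+ *   // even-indexed elements take the converted value, odd-indexed keep w
+ *   __m512bh merged = _mm512_mask_cvtne2ps_pbh(w, 0x55555555, a, b);
+ *   // even-indexed elements take the converted value, odd-indexed are zeroed
+ *   __m512bh zeroed = _mm512_maskz_cvtne2ps_pbh(0x55555555, a, b);
+ */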
+
+/// Convert Packed Single Data to Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 512-bit vector of [16 x float].
+/// \returns A 256-bit vector of [16 x bfloat] resulting from conversion of __A.
+static __inline__ __m256bh __DEFAULT_FN_ATTRS512
+_mm512_cvtneps_pbh(__m512 __A) {
+ return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
+ (__v16hi)_mm256_undefined_si256(),
+ (__mmask16)-1);
+}
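+/* Round-trip sketch (illustrative; variable names are placeholders): each
+ * float is rounded to the nearest BF16 value on the narrowing step, and the
+ * widening step back to float is exact:
+ *
+ *   __m512 x = _mm512_set1_ps(3.14159f);
+ *   __m256bh xb = _mm512_cvtneps_pbh(x);  // narrow, round-to-nearest-even
+ *   __m512 y = _mm512_cvtpbh_ps(xb);      // widen; y approximates x
+ */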
+
+/// Convert Packed Single Data to Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 512-bit vector of [16 x float].
+/// \param __W
+/// A 256-bit vector of [16 x bfloat].
+/// \param __U
+/// A 16-bit mask value specifying what is chosen for each element.
+/// A 1 means conversion of __A. A 0 means element from __W.
+/// \returns A 256-bit vector of [16 x bfloat] resulting from conversion of __A.
+static __inline__ __m256bh __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
+ return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
+ (__v16hi)__W,
+ (__mmask16)__U);
+}
+
+/// Convert Packed Single Data to Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 512-bit vector of [16 x float].
+/// \param __U
+/// A 16-bit mask value specifying what is chosen for each element.
+/// A 1 means conversion of __A. A 0 means element is zero.
+/// \returns A 256-bit vector of [16 x bfloat] resulting from conversion of __A.
+static __inline__ __m256bh __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
+ return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
+ (__v16hi)_mm256_setzero_si256(),
+ (__mmask16)__U);
+}
+
+/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
+///
+/// \param __A
+/// A 512-bit vector of [32 x bfloat].
+/// \param __B
+/// A 512-bit vector of [32 x bfloat].
+/// \param __D
+/// A 512-bit vector of [16 x float].
+/// \returns A 512-bit vector of [16 x float] containing the dot product of
+/// __A and __B accumulated into __D.
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
+ return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
+ (__v16si) __A,
+ (__v16si) __B);
+}
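+/* Semantics sketch (hedged; ignoring intermediate rounding details): each
+ * 32-bit result lane accumulates the products of the two BF16 pairs sharing
+ * that lane,
+ *
+ *   result[i] = __D[i] + (float)__A[2*i]   * (float)__B[2*i]
+ *                      + (float)__A[2*i+1] * (float)__B[2*i+1]
+ */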
+
+/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
+///
+/// \param __A
+/// A 512-bit vector of [32 x bfloat].
+/// \param __B
+/// A 512-bit vector of [32 x bfloat].
+/// \param __D
+/// A 512-bit vector of [16 x float].
+/// \param __U
+/// A 16-bit mask value specifying what is chosen for each element.
+/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
+/// \returns A 512-bit vector of [16 x float] containing the dot product of
+/// __A and __B accumulated into __D.
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
+ (__v16sf)__D);
+}
+
+/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
+///
+/// \param __A
+/// A 512-bit vector of [32 x bfloat].
+/// \param __B
+/// A 512-bit vector of [32 x bfloat].
+/// \param __D
+/// A 512-bit vector of [16 x float].
+/// \param __U
+/// A 16-bit mask value specifying what is chosen for each element.
+/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
+/// \returns A 512-bit vector of [16 x float] containing the dot product of
+/// __A and __B accumulated into __D.
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
+ (__v16sf)_mm512_setzero_si512());
+}
+
+/// Convert Packed BF16 Data to Packed float Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __A
+/// A 256-bit vector of [16 x bfloat].
+/// \returns A 512-bit vector of [16 x float] resulting from conversion of __A.
+static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
+ return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
+ (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
+}
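+/* Note on the implementation above: a BF16 value is exactly the upper 16 bits
+ * of the corresponding IEEE-754 binary32 value, so widening each 16-bit
+ * element to 32 bits and shifting it left by 16 reconstructs the float bit
+ * pattern (with the low fraction bits zeroed); no dedicated instruction is
+ * needed.
+ */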
+
+/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __U
+/// A 16-bit mask. Elements are zeroed out when the corresponding mask
+/// bit is not set.
+/// \param __A
+/// A 256-bit vector of [16 x bfloat].
+/// \returns A 512-bit vector of [16 x float] resulting from conversion of __A.
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
+ return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
+ (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
+}
+
+/// Convert Packed BF16 Data to Packed float Data using merging mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __S
+/// A 512-bit vector of [16 x float]. Elements are copied from __S when
+/// the corresponding mask bit is not set.
+/// \param __U
+/// A 16-bit mask.
+/// \param __A
+/// A 256-bit vector of [16 x bfloat].
+/// \returns A 512-bit vector of [16 x float] resulting from conversion of __A.
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
+ return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
+ (__m512i)__S, (__mmask16)__U,
+ (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
+}
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS512
+
+#endif
--- /dev/null
+/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512BITALGINTRIN_H
+#define __AVX512BITALGINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), __min_vector_width__(512)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_popcnt_epi16(__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
+ (__v32hi) _mm512_popcnt_epi16(__B),
+ (__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
+{
+ return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(),
+ __U,
+ __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_popcnt_epi8(__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
+ (__v64qi) _mm512_popcnt_epi8(__B),
+ (__v64qi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
+{
+ return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(),
+ __U,
+ __B);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
+ (__v64qi) __B,
+ __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
+{
+ return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
+ __A,
+ __B);
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512BWINTRIN_H
+#define __AVX512BWINTRIN_H
+
+typedef unsigned int __mmask32;
+typedef unsigned long long __mmask64;
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw")))
+
+static __inline __mmask32 __DEFAULT_FN_ATTRS
+_knot_mask32(__mmask32 __M)
+{
+ return __builtin_ia32_knotsi(__M);
+}
+
+static __inline __mmask64 __DEFAULT_FN_ATTRS
+_knot_mask64(__mmask64 __M)
+{
+ return __builtin_ia32_knotdi(__M);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_kand_mask32(__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32)__builtin_ia32_kandsi((__mmask32)__A, (__mmask32)__B);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_kand_mask64(__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64)__builtin_ia32_kanddi((__mmask64)__A, (__mmask64)__B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_kandn_mask32(__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32)__builtin_ia32_kandnsi((__mmask32)__A, (__mmask32)__B);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_kandn_mask64(__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64)__builtin_ia32_kandndi((__mmask64)__A, (__mmask64)__B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_kor_mask32(__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32)__builtin_ia32_korsi((__mmask32)__A, (__mmask32)__B);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_kor_mask64(__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64)__builtin_ia32_kordi((__mmask64)__A, (__mmask64)__B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_kxnor_mask32(__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32)__builtin_ia32_kxnorsi((__mmask32)__A, (__mmask32)__B);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_kxnor_mask64(__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64)__builtin_ia32_kxnordi((__mmask64)__A, (__mmask64)__B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_kxor_mask32(__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32)__builtin_ia32_kxorsi((__mmask32)__A, (__mmask32)__B);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_kxor_mask64(__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_kortestc_mask32_u8(__mmask32 __A, __mmask32 __B)
+{
+ return (unsigned char)__builtin_ia32_kortestcsi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_kortestz_mask32_u8(__mmask32 __A, __mmask32 __B)
+{
+ return (unsigned char)__builtin_ia32_kortestzsi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
+ *__C = (unsigned char)__builtin_ia32_kortestcsi(__A, __B);
+ return (unsigned char)__builtin_ia32_kortestzsi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_kortestc_mask64_u8(__mmask64 __A, __mmask64 __B)
+{
+ return (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_kortestz_mask64_u8(__mmask64 __A, __mmask64 __B)
+{
+ return (unsigned char)__builtin_ia32_kortestzdi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_kortest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) {
+ *__C = (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
+ return (unsigned char)__builtin_ia32_kortestzdi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_ktestc_mask32_u8(__mmask32 __A, __mmask32 __B)
+{
+ return (unsigned char)__builtin_ia32_ktestcsi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_ktestz_mask32_u8(__mmask32 __A, __mmask32 __B)
+{
+ return (unsigned char)__builtin_ia32_ktestzsi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
+ *__C = (unsigned char)__builtin_ia32_ktestcsi(__A, __B);
+ return (unsigned char)__builtin_ia32_ktestzsi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_ktestc_mask64_u8(__mmask64 __A, __mmask64 __B)
+{
+ return (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_ktestz_mask64_u8(__mmask64 __A, __mmask64 __B)
+{
+ return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_ktest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) {
+ *__C = (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
+ return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_kadd_mask32(__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32)__builtin_ia32_kaddsi((__mmask32)__A, (__mmask32)__B);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_kadd_mask64(__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64)__builtin_ia32_kadddi((__mmask64)__A, (__mmask64)__B);
+}
+
+#define _kshiftli_mask32(A, I) \
+ ((__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I)))
+
+#define _kshiftri_mask32(A, I) \
+ ((__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I)))
+
+#define _kshiftli_mask64(A, I) \
+ ((__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I)))
+
+#define _kshiftri_mask64(A, I) \
+ ((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I)))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_cvtmask32_u32(__mmask32 __A) {
+ return (unsigned int)__builtin_ia32_kmovd((__mmask32)__A);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_cvtmask64_u64(__mmask64 __A) {
+ return (unsigned long long)__builtin_ia32_kmovq((__mmask64)__A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_cvtu32_mask32(unsigned int __A) {
+ return (__mmask32)__builtin_ia32_kmovd((__mmask32)__A);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_cvtu64_mask64(unsigned long long __A) {
+ return (__mmask64)__builtin_ia32_kmovq((__mmask64)__A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_load_mask32(__mmask32 *__A) {
+ return (__mmask32)__builtin_ia32_kmovd(*(__mmask32 *)__A);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_load_mask64(__mmask64 *__A) {
+ return (__mmask64)__builtin_ia32_kmovq(*(__mmask64 *)__A);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_store_mask32(__mmask32 *__A, __mmask32 __B) {
+ *(__mmask32 *)__A = __builtin_ia32_kmovd((__mmask32)__B);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_store_mask64(__mmask64 *__A, __mmask64 __B) {
+ *(__mmask64 *)__A = __builtin_ia32_kmovq((__mmask64)__B);
+}
+
+/* Integer compare */
+
+#define _mm512_cmp_epi8_mask(a, b, p) \
+ ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+ (__v64qi)(__m512i)(b), (int)(p), \
+ (__mmask64)-1))
+
+#define _mm512_mask_cmp_epi8_mask(m, a, b, p) \
+ ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
+ (__v64qi)(__m512i)(b), (int)(p), \
+ (__mmask64)(m)))
+
+#define _mm512_cmp_epu8_mask(a, b, p) \
+ ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+ (__v64qi)(__m512i)(b), (int)(p), \
+ (__mmask64)-1))
+
+#define _mm512_mask_cmp_epu8_mask(m, a, b, p) \
+ ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
+ (__v64qi)(__m512i)(b), (int)(p), \
+ (__mmask64)(m)))
+
+#define _mm512_cmp_epi16_mask(a, b, p) \
+ ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+ (__v32hi)(__m512i)(b), (int)(p), \
+ (__mmask32)-1))
+
+#define _mm512_mask_cmp_epi16_mask(m, a, b, p) \
+ ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
+ (__v32hi)(__m512i)(b), (int)(p), \
+ (__mmask32)(m)))
+
+#define _mm512_cmp_epu16_mask(a, b, p) \
+ ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+ (__v32hi)(__m512i)(b), (int)(p), \
+ (__mmask32)-1))
+
+#define _mm512_mask_cmp_epu16_mask(m, a, b, p) \
+ ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
+ (__v32hi)(__m512i)(b), (int)(p), \
+ (__mmask32)(m)))
+
+#define _mm512_cmpeq_epi8_mask(A, B) \
+ _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epi8_mask(k, A, B) \
+ _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epi8_mask(A, B) \
+ _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epi8_mask(k, A, B) \
+ _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epi8_mask(A, B) \
+ _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epi8_mask(k, A, B) \
+ _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epi8_mask(A, B) \
+ _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epi8_mask(k, A, B) \
+ _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epi8_mask(A, B) \
+ _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epi8_mask(k, A, B) \
+ _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epi8_mask(A, B) \
+ _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epi8_mask(k, A, B) \
+ _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm512_cmpeq_epu8_mask(A, B) \
+ _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epu8_mask(k, A, B) \
+ _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epu8_mask(A, B) \
+ _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epu8_mask(k, A, B) \
+ _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epu8_mask(A, B) \
+ _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epu8_mask(k, A, B) \
+ _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epu8_mask(A, B) \
+ _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epu8_mask(k, A, B) \
+ _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epu8_mask(A, B) \
+ _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epu8_mask(k, A, B) \
+ _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epu8_mask(A, B) \
+ _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epu8_mask(k, A, B) \
+ _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm512_cmpeq_epi16_mask(A, B) \
+ _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epi16_mask(k, A, B) \
+ _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epi16_mask(A, B) \
+ _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epi16_mask(k, A, B) \
+ _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epi16_mask(A, B) \
+ _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epi16_mask(k, A, B) \
+ _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epi16_mask(A, B) \
+ _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epi16_mask(k, A, B) \
+ _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epi16_mask(A, B) \
+ _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epi16_mask(k, A, B) \
+ _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epi16_mask(A, B) \
+ _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epi16_mask(k, A, B) \
+ _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm512_cmpeq_epu16_mask(A, B) \
+ _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epu16_mask(k, A, B) \
+ _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epu16_mask(A, B) \
+ _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epu16_mask(k, A, B) \
+ _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epu16_mask(A, B) \
+ _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epu16_mask(k, A, B) \
+ _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epu16_mask(A, B) \
+ _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epu16_mask(k, A, B) \
+ _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epu16_mask(A, B) \
+ _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epu16_mask(k, A, B) \
+ _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epu16_mask(A, B) \
+ _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epu16_mask(k, A, B) \
+ _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_add_epi8 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v64qu) __A + (__v64qu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_add_epi8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_add_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sub_epi8 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v64qu) __A - (__v64qu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_sub_epi8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_sub_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_add_epi16 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v32hu) __A + (__v32hu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_add_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_add_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sub_epi16 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v32hu) __A - (__v32hu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sub_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sub_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mullo_epi16 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v32hu) __A * (__v32hu) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_mullo_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_mullo_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
+{
+ return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
+ (__v64qi) __W,
+ (__v64qi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
+{
+ return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
+ (__v32hi) __W,
+ (__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_abs_epi8 (__m512i __A)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pabsb512((__v64qi)__A);
+#else
+ return (__m512i)__builtin_elementwise_abs((__v64qs)__A);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_abs_epi8(__A),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_abs_epi8(__A),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_abs_epi16 (__m512i __A)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pabsw512((__v32hi)__A);
+#else
+ return (__m512i)__builtin_elementwise_abs((__v32hi)__A);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_abs_epi16(__A),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_abs_epi16(__A),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_packs_epi32(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_packs_epi32(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_packs_epi32(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_packs_epi16(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_packs_epi16(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_packs_epi16(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_packus_epi32(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_packus_epi32(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_packus_epi32(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_packus_epi16(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_packus_epi16(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_packus_epi16(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_adds_epi8 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B);
+#else
+ return (__m512i)__builtin_ia32_paddsb512((__v64qi)__A, (__v64qi)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_adds_epi8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_adds_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_adds_epi16 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B);
+#else
+ return (__m512i)__builtin_ia32_paddsw512((__v32hi)__A, (__v32hi)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_adds_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_adds_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_adds_epu8 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B);
+#else
+ return (__m512i)__builtin_ia32_paddusb512((__v64qi) __A, (__v64qi) __B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_adds_epu8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_adds_epu8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_adds_epu16 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B);
+#else
+ return (__m512i)__builtin_ia32_paddusw512((__v32hi) __A, (__v32hi) __B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_adds_epu16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_adds_epu16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_avg_epu8 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_pavgb512((__v64qi)__A, (__v64qi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_avg_epu8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_avg_epu8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_avg_epu16 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_pavgw512((__v32hi)__A, (__v32hi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_avg_epu16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_avg_epu16(__A, __B),
+ (__v32hi) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_max_epi8 (__m512i __A, __m512i __B)
+{
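+ /* __builtin_elementwise_max/min were introduced in clang 14, so the
+  * (__clang_major__ < 14) guard keeps the target-specific builtins for
+  * older releases. */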
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pmaxsb512((__v64qi) __A, (__v64qi) __B);
+#else
+ return (__m512i)__builtin_elementwise_max((__v64qs) __A, (__v64qs) __B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_max_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_max_epi8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_max_epi16 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pmaxsw512((__v32hi) __A, (__v32hi) __B);
+#else
+ return (__m512i)__builtin_elementwise_max((__v32hi) __A, (__v32hi) __B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_max_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_max_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_max_epu8 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pmaxub512((__v64qi)__A, (__v64qi)__B);
+#else
+ return (__m512i)__builtin_elementwise_max((__v64qu)__A, (__v64qu)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_max_epu8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_max_epu8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_max_epu16 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pmaxuw512((__v32hi)__A, (__v32hi)__B);
+#else
+ return (__m512i)__builtin_elementwise_max((__v32hu)__A, (__v32hu)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_max_epu16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_max_epu16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_min_epi8 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pminsb512((__v64qi) __A, (__v64qi) __B);
+#else
+ return (__m512i)__builtin_elementwise_min((__v64qs) __A, (__v64qs) __B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_min_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_min_epi8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_min_epi16 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pminsw512((__v32hi) __A, (__v32hi) __B);
+#else
+ return (__m512i)__builtin_elementwise_min((__v32hi) __A, (__v32hi) __B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_min_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_min_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_min_epu8 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pminub512((__v64qi)__A, (__v64qi)__B);
+#else
+ return (__m512i)__builtin_elementwise_min((__v64qu)__A, (__v64qu)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_min_epu8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_min_epu8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_min_epu16 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pminuw512((__v32hi)__A, (__v32hi)__B);
+#else
+ return (__m512i)__builtin_elementwise_min((__v32hu)__A, (__v32hu)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_min_epu16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_min_epu16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_shuffle_epi8(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_shuffle_epi8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_shuffle_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_subs_epi8 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B);
+#else
+ return (__m512i)__builtin_ia32_psubsb512((__v64qi)__A, (__v64qi)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_subs_epi8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_subs_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_subs_epi16 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B);
+#else
+ return (__m512i)__builtin_ia32_psubsw512((__v32hi)__A, (__v32hi)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_subs_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_subs_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_subs_epu8 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B);
+#else
+ return (__m512i)__builtin_ia32_psubusb512((__v64qi) __A, (__v64qi) __B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_subs_epu8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_subs_epu8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_subs_epu16 (__m512i __A, __m512i __B)
+{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B);
+#else
+ return (__m512i)__builtin_ia32_psubusw512((__v32hi) __A, (__v32hi) __B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_subs_epu16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_subs_epu16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
+ (__v32hi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512(__U,
+ (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
+ (__v32hi)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512(__U,
+ (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
+ (__v32hi)__I);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512(__U,
+ (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mulhrs_epi16(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_pmulhrsw512((__v32hi)__A, (__v32hi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_mulhrs_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_mulhrs_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mulhi_epi16(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_pmulhw512((__v32hi) __A, (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_mulhi_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_mulhi_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mulhi_epu16(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_pmulhuw512((__v32hi) __A, (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_mulhi_epu16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_mulhi_epu16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maddubs_epi16(__m512i __X, __m512i __Y) {
+ return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X,
+ __m512i __Y) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
+ (__v32hi)_mm512_maddubs_epi16(__X, __Y),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
+ (__v32hi)_mm512_maddubs_epi16(__X, __Y),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_madd_epi16(__m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_madd_epi16(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_madd_epi16(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtsepi16_epi8 (__m512i __A) {
+ return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+ (__v32qi)_mm256_setzero_si256(),
+ (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+ return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+ (__v32qi)__O,
+ __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) {
+ return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+ (__v32qi) _mm256_setzero_si256(),
+ __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtusepi16_epi8 (__m512i __A) {
+ return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+ (__v32qi) _mm256_setzero_si256(),
+ (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+ return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+ (__v32qi) __O,
+ __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) {
+ return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+ (__v32qi) _mm256_setzero_si256(),
+ __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi16_epi8 (__m512i __A) {
+ return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+ (__v32qi) _mm256_undefined_si256(),
+ (__mmask32) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
+ return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+ (__v32qi) __O,
+ __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) {
+ return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+ (__v32qi) _mm256_setzero_si256(),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+ __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+ __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+ __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
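+ /* Unpacking operates within each 128-bit lane, which is why the shuffle
+  * indices below advance in 16-byte groups rather than running 0..63. */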
+ return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
+ 8, 64+8, 9, 64+9,
+ 10, 64+10, 11, 64+11,
+ 12, 64+12, 13, 64+13,
+ 14, 64+14, 15, 64+15,
+ 24, 64+24, 25, 64+25,
+ 26, 64+26, 27, 64+27,
+ 28, 64+28, 29, 64+29,
+ 30, 64+30, 31, 64+31,
+ 40, 64+40, 41, 64+41,
+ 42, 64+42, 43, 64+43,
+ 44, 64+44, 45, 64+45,
+ 46, 64+46, 47, 64+47,
+ 56, 64+56, 57, 64+57,
+ 58, 64+58, 59, 64+59,
+ 60, 64+60, 61, 64+61,
+ 62, 64+62, 63, 64+63);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_unpackhi_epi8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_unpackhi_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
+ return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
+ 4, 32+4, 5, 32+5,
+ 6, 32+6, 7, 32+7,
+ 12, 32+12, 13, 32+13,
+ 14, 32+14, 15, 32+15,
+ 20, 32+20, 21, 32+21,
+ 22, 32+22, 23, 32+23,
+ 28, 32+28, 29, 32+29,
+ 30, 32+30, 31, 32+31);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_unpackhi_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_unpackhi_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
+ return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
+ 0, 64+0, 1, 64+1,
+ 2, 64+2, 3, 64+3,
+ 4, 64+4, 5, 64+5,
+ 6, 64+6, 7, 64+7,
+ 16, 64+16, 17, 64+17,
+ 18, 64+18, 19, 64+19,
+ 20, 64+20, 21, 64+21,
+ 22, 64+22, 23, 64+23,
+ 32, 64+32, 33, 64+33,
+ 34, 64+34, 35, 64+35,
+ 36, 64+36, 37, 64+37,
+ 38, 64+38, 39, 64+39,
+ 48, 64+48, 49, 64+49,
+ 50, 64+50, 51, 64+51,
+ 52, 64+52, 53, 64+53,
+ 54, 64+54, 55, 64+55);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_unpacklo_epi8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
+ (__v64qi)_mm512_unpacklo_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
+ return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
+ 0, 32+0, 1, 32+1,
+ 2, 32+2, 3, 32+3,
+ 8, 32+8, 9, 32+9,
+ 10, 32+10, 11, 32+11,
+ 16, 32+16, 17, 32+17,
+ 18, 32+18, 19, 32+19,
+ 24, 32+24, 25, 32+25,
+ 26, 32+26, 27, 32+27);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_unpacklo_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_unpacklo_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi8_epi16(__m256i __A)
+{
+ /* This function always performs a signed extension, but __v32qi is a vector
+ of char, which may be signed or unsigned, so use __v32qs. */
+ return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtepi8_epi16(__A),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtepi8_epi16(__A),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepu8_epi16(__m256i __A)
+{
+ return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtepu8_epi16(__A),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_cvtepu8_epi16(__A),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+#define _mm512_shufflehi_epi16(A, imm) \
+ ((__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm)))
+
+#define _mm512_mask_shufflehi_epi16(W, U, A, imm) \
+ ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+ (__v32hi)_mm512_shufflehi_epi16((A), \
+ (imm)), \
+ (__v32hi)(__m512i)(W)))
+
+#define _mm512_maskz_shufflehi_epi16(U, A, imm) \
+ ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+ (__v32hi)_mm512_shufflehi_epi16((A), \
+ (imm)), \
+ (__v32hi)_mm512_setzero_si512()))
+
+#define _mm512_shufflelo_epi16(A, imm) \
+ ((__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm)))
+
+#define _mm512_mask_shufflelo_epi16(W, U, A, imm) \
+ ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+ (__v32hi)_mm512_shufflelo_epi16((A), \
+ (imm)), \
+ (__v32hi)(__m512i)(W)))
+
+#define _mm512_maskz_shufflelo_epi16(U, A, imm) \
+ ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+ (__v32hi)_mm512_shufflelo_epi16((A), \
+ (imm)), \
+ (__v32hi)_mm512_setzero_si512()))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sllv_epi16(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_psllv32hi((__v32hi) __A, (__v32hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sllv_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sllv_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sll_epi16(__m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sll_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sll_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_slli_epi16(__m512i __A, unsigned int __B)
+{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, (int)__B);
+#else
+ return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
+ unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_slli_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_slli_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+#define _mm512_bslli_epi128(a, imm) \
+ ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srlv_epi16(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_psrlv32hi((__v32hi)__A, (__v32hi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srlv_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srlv_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srav_epi16(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_psrav32hi((__v32hi)__A, (__v32hi)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srav_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srav_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sra_epi16(__m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sra_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_sra_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srai_epi16(__m512i __A, unsigned int __B)
+{
+#if (__clang_major__ > 14)
+ return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, (int)__B);
+#else
+ return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A,
+ unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srai_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srai_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srl_epi16(__m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srl_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srl_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srli_epi16(__m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
+ unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srli_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
+ (__v32hi)_mm512_srli_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+#define _mm512_bsrli_epi128(a, imm) \
+ ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
+ (__v32hi) __A,
+ (__v32hi) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
+ (__v32hi) __A,
+ (__v32hi) _mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
+ (__v64qi) __A,
+ (__v64qi) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
+ (__v64qi) __A,
+ (__v64qi) _mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
+{
+ return (__m512i) __builtin_ia32_selectb_512(__M,
+ (__v64qi)_mm512_set1_epi8(__A),
+ (__v64qi) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
+{
+ return (__m512i) __builtin_ia32_selectb_512(__M,
+ (__v64qi) _mm512_set1_epi8(__A),
+ (__v64qi) _mm512_setzero_si512());
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS
+_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
+ (__mmask64) __B);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS
+_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
+ (__mmask32) __B);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_loadu_epi16 (void const *__P)
+{
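+ /* Wrapping the unaligned vector in a packed, may_alias struct lets the
+  * compiler emit an unaligned 512-bit load without violating
+  * strict-aliasing rules. */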
+ struct __loadu_epi16 {
+ __m512i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_epi16*)__P)->__v;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddquhi512_mask ((const __v32hi *) __P,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddquhi512_mask ((const __v32hi *) __P,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_loadu_epi8 (void const *__P)
+{
+ struct __loadu_epi8 {
+ __m512i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_epi8*)__P)->__v;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddquqi512_mask ((const __v64qi *) __P,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddquqi512_mask ((const __v64qi *) __P,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_storeu_epi16 (void *__P, __m512i __A)
+{
+ struct __storeu_epi16 {
+ __m512i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_epi16*)__P)->__v = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A)
+{
+ __builtin_ia32_storedquhi512_mask ((__v32hi *) __P,
+ (__v32hi) __A,
+ (__mmask32) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_storeu_epi8 (void *__P, __m512i __A)
+{
+ struct __storeu_epi8 {
+ __m512i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_epi8*)__P)->__v = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
+{
+ __builtin_ia32_storedquqi512_mask ((__v64qi *) __P,
+ (__v64qi) __A,
+ (__mmask64) __U);
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
+_mm512_test_epi8_mask (__m512i __A, __m512i __B)
+{
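+ /* The AND can be done at dword granularity because it is purely bitwise;
+  * the per-byte compare against zero then yields the test mask. */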
+ return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
+_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return _mm512_mask_cmpneq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
+_mm512_test_epi16_mask (__m512i __A, __m512i __B)
+{
+ return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
+_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
+_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
+{
+ return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_si512());
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
+_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
+_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
+{
+ return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
+_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
+_mm512_movepi8_mask (__m512i __A)
+{
+ return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
+_mm512_movepi16_mask (__m512i __A)
+{
+ return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_movm_epi8 (__mmask64 __A)
+{
+ return (__m512i) __builtin_ia32_cvtmask2b512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_movm_epi16 (__mmask32 __A)
+{
+ return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_broadcastb_epi8 (__m128i __A)
+{
+ return (__m512i)__builtin_shufflevector((__v16qi) __A, (__v16qi) __A,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectb_512(__M,
+ (__v64qi) _mm512_broadcastb_epi8(__A),
+ (__v64qi) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectb_512(__M,
+ (__v64qi) _mm512_broadcastb_epi8(__A),
+ (__v64qi) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
+{
+ return (__m512i) __builtin_ia32_selectw_512(__M,
+ (__v32hi) _mm512_set1_epi16(__A),
+ (__v32hi) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
+{
+ return (__m512i) __builtin_ia32_selectw_512(__M,
+ (__v32hi) _mm512_set1_epi16(__A),
+ (__v32hi) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_broadcastw_epi16 (__m128i __A)
+{
+ return (__m512i)__builtin_shufflevector((__v8hi) __A, (__v8hi) __A,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectw_512(__M,
+ (__v32hi) _mm512_broadcastw_epi16(__A),
+ (__v32hi) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectw_512(__M,
+ (__v32hi) _mm512_broadcastw_epi16(__A),
+ (__v32hi) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_permutexvar_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_permutexvar_epi16(__A, __B),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
+ (__v32hi)_mm512_permutexvar_epi16(__A, __B),
+ (__v32hi)__W);
+}
+
+#define _mm512_alignr_epi8(A, B, N) \
+ ((__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \
+ (__v64qi)(__m512i)(B), (int)(N)))
+
+#define _mm512_mask_alignr_epi8(W, U, A, B, N) \
+ ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
+ (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
+ (__v64qi)(__m512i)(W)))
+
+#define _mm512_maskz_alignr_epi8(U, A, B, N) \
+ ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
+ (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
+ (__v64qi)(__m512i)_mm512_setzero_si512()))
+
+#define _mm512_dbsad_epu8(A, B, imm) \
+ ((__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \
+ (__v64qi)(__m512i)(B), (int)(imm)))
+
+#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+ (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
+ (__v32hi)(__m512i)(W)))
+
+#define _mm512_maskz_dbsad_epu8(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+ (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
+ (__v32hi)_mm512_setzero_si512()))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sad_epu8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A,
+ (__v64qi) __B);
+}
+
+#undef __DEFAULT_FN_ATTRS512
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512CDINTRIN_H
+#define __AVX512CDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), __min_vector_width__(512)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi64 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_conflict_epi64(__A),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_conflict_epi64(__A),
+ (__v8di)_mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_conflict_epi32 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_conflict_epi32(__A),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_conflict_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi32 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_lzcnt_epi32(__A),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_lzcnt_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_lzcnt_epi64 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_lzcnt_epi64(__A),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_lzcnt_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmb_epi64 (__mmask8 __A)
+{
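+ /* Zero-extend the 8-bit mask to 64 bits and splat it into every element. */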
+ return (__m512i) _mm512_set1_epi64((long long) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_broadcastmw_epi32 (__mmask16 __A)
+{
+ return (__m512i) _mm512_set1_epi32((int) __A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512DQINTRIN_H
+#define __AVX512DQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq")))
+
+static __inline __mmask8 __DEFAULT_FN_ATTRS
+_knot_mask8(__mmask8 __M)
+{
+ return __builtin_ia32_knotqi(__M);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_kand_mask8(__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8)__builtin_ia32_kandqi((__mmask8)__A, (__mmask8)__B);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_kandn_mask8(__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8)__builtin_ia32_kandnqi((__mmask8)__A, (__mmask8)__B);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_kor_mask8(__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8)__builtin_ia32_korqi((__mmask8)__A, (__mmask8)__B);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_kxnor_mask8(__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8)__builtin_ia32_kxnorqi((__mmask8)__A, (__mmask8)__B);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_kxor_mask8(__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_kortestc_mask8_u8(__mmask8 __A, __mmask8 __B)
+{
+ return (unsigned char)__builtin_ia32_kortestcqi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_kortestz_mask8_u8(__mmask8 __A, __mmask8 __B)
+{
+ return (unsigned char)__builtin_ia32_kortestzqi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_kortest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) {
+ *__C = (unsigned char)__builtin_ia32_kortestcqi(__A, __B);
+ return (unsigned char)__builtin_ia32_kortestzqi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_ktestc_mask8_u8(__mmask8 __A, __mmask8 __B)
+{
+ return (unsigned char)__builtin_ia32_ktestcqi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_ktestz_mask8_u8(__mmask8 __A, __mmask8 __B)
+{
+ return (unsigned char)__builtin_ia32_ktestzqi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_ktest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) {
+ *__C = (unsigned char)__builtin_ia32_ktestcqi(__A, __B);
+ return (unsigned char)__builtin_ia32_ktestzqi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_ktestc_mask16_u8(__mmask16 __A, __mmask16 __B)
+{
+ return (unsigned char)__builtin_ia32_ktestchi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_ktestz_mask16_u8(__mmask16 __A, __mmask16 __B)
+{
+ return (unsigned char)__builtin_ia32_ktestzhi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
+ *__C = (unsigned char)__builtin_ia32_ktestchi(__A, __B);
+ return (unsigned char)__builtin_ia32_ktestzhi(__A, __B);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_kadd_mask8(__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8)__builtin_ia32_kaddqi((__mmask8)__A, (__mmask8)__B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_kadd_mask16(__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16)__builtin_ia32_kaddhi((__mmask16)__A, (__mmask16)__B);
+}
+
+#define _kshiftli_mask8(A, I) \
+ ((__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I)))
+
+#define _kshiftri_mask8(A, I) \
+ ((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I)))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_cvtmask8_u32(__mmask8 __A) {
+ return (unsigned int)__builtin_ia32_kmovb((__mmask8)__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_cvtu32_mask8(unsigned int __A) {
+ return (__mmask8)__builtin_ia32_kmovb((__mmask8)__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS
+_load_mask8(__mmask8 *__A) {
+ return (__mmask8)__builtin_ia32_kmovb(*(__mmask8 *)__A);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_store_mask8(__mmask8 *__A, __mmask8 __B) {
+ *(__mmask8 *)__A = __builtin_ia32_kmovb((__mmask8)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mullo_epi64 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v8du) __A * (__v8du) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_mullo_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_mullo_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_xor_pd(__m512d __A, __m512d __B) {
+ return (__m512d)((__v8du)__A ^ (__v8du)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_xor_pd(__A, __B),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_xor_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_xor_ps (__m512 __A, __m512 __B) {
+ return (__m512)((__v16su)__A ^ (__v16su)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_xor_ps(__A, __B),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_xor_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_or_pd(__m512d __A, __m512d __B) {
+ return (__m512d)((__v8du)__A | (__v8du)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_or_pd(__A, __B),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_or_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_or_ps(__m512 __A, __m512 __B) {
+ return (__m512)((__v16su)__A | (__v16su)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_or_ps(__A, __B),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_or_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_and_pd(__m512d __A, __m512d __B) {
+ return (__m512d)((__v8du)__A & (__v8du)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_and_pd(__A, __B),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_and_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_and_ps(__m512 __A, __m512 __B) {
+ return (__m512)((__v16su)__A & (__v16su)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_and_ps(__A, __B),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_and_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_andnot_pd(__m512d __A, __m512d __B) {
+ return (__m512d)(~(__v8du)__A & (__v8du)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_andnot_pd(__A, __B),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_andnot_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_andnot_ps(__m512 __A, __m512 __B) {
+ return (__m512)(~(__v16su)__A & (__v16su)__B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_andnot_ps(__A, __B),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_andnot_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtpd_epi64 (__m512d __A) {
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) {
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epi64(A, R) \
+ ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+ (__v8di)(__m512i)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \
+ ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtpd_epu64 (__m512d __A) {
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) {
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epu64(A, R) \
+ ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
+ (__v8di)(__m512i)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \
+ ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtps_epi64 (__m256 __A) {
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) {
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epi64(A, R) \
+ ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
+ (__v8di)(__m512i)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundps_epi64(U, A, R) \
+ ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtps_epu64 (__m256 __A) {
+ return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
+ return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) {
+ return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epu64(A, R) \
+ ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
+ (__v8di)(__m512i)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundps_epu64(U, A, R) \
+ ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)(U), (int)(R)))
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_cvtepi64_pd (__m512i __A) {
+ return (__m512d)__builtin_convertvector((__v8di)__A, __v8df);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_cvtepi64_pd(__A),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_cvtepi64_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+#define _mm512_cvt_roundepi64_pd(A, R) \
+ ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \
+ ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \
+ ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS512
+_mm512_cvtepi64_ps (__m512i __A) {
+ return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+ (__v8sf) _mm256_setzero_ps(),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
+ return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+ (__v8sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) {
+ return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+ (__v8sf) _mm256_setzero_ps(),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi64_ps(A, R) \
+ ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \
+ ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
+ (__v8sf)(__m256)(W), (__mmask8)(U), \
+ (int)(R)))
+
+#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \
+ ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvttpd_epi64 (__m512d __A) {
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) {
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi64(A, R) \
+ ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
+ (__v8di)(__m512i)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \
+ ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvttpd_epu64 (__m512d __A) {
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) {
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epu64(A, R) \
+ ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
+ (__v8di)(__m512i)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \
+ ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvttps_epi64 (__m256 __A) {
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) {
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epi64(A, R) \
+ ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
+ (__v8di)(__m512i)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \
+ ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvttps_epu64 (__m256 __A) {
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) {
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epu64(A, R) \
+ ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
+ (__v8di)(__m512i)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \
+ ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
+ (__v8di)_mm512_setzero_si512(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_cvtepu64_pd (__m512i __A) {
+ return (__m512d)__builtin_convertvector((__v8du)__A, __v8df);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_cvtepu64_pd(__A),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_cvtepu64_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+#define _mm512_cvt_roundepu64_pd(A, R) \
+ ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \
+ ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \
+ ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS512
+_mm512_cvtepu64_ps (__m512i __A) {
+ return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+ (__v8sf) _mm256_setzero_ps(),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
+ return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+ (__v8sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
+ return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+ (__v8sf) _mm256_setzero_ps(),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu64_ps(A, R) \
+ ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \
+ ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+ (__v8sf)(__m256)(W), (__mmask8)(U), \
+ (int)(R)))
+
+#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \
+ ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_range_pd(A, B, C) \
+ ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_range_pd(W, U, A, B, C) \
+ ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)(__m512d)(W), (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_range_pd(U, A, B, C) \
+ ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_range_round_pd(A, B, C, R) \
+ ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)(__m512d)(W), (__mmask8)(U), \
+ (int)(R)))
+
+#define _mm512_maskz_range_round_pd(U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_range_ps(A, B, C) \
+ ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_range_ps(W, U, A, B, C) \
+ ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)(__m512)(W), (__mmask16)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_range_ps(U, A, B, C) \
+ ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_range_round_ps(A, B, C, R) \
+ ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \
+ ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)(__m512)(W), (__mmask16)(U), \
+ (int)(R)))
+
+#define _mm512_maskz_range_round_ps(U, A, B, C, R) \
+ ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm_range_round_ss(A, B, C, R) \
+ ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8) -1, (int)(C),\
+ (int)(R)))
+
+#define _mm_range_ss(A, B, C) _mm_range_round_ss(A, B, C, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_range_round_ss(W, U, A, B, C, R) \
+ ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W),\
+ (__mmask8)(U), (int)(C),\
+ (int)(R)))
+
+#define _mm_mask_range_ss(W, U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_range_round_ss(U, A, B, C, R) \
+ ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(C),\
+ (int)(R)))
+
+#define _mm_maskz_range_ss(U, A, B, C) _mm_maskz_range_round_ss(U, A, B, C, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_range_round_sd(A, B, C, R) \
+ ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8) -1, (int)(C),\
+ (int)(R)))
+
+#define _mm_range_sd(A, B, C) _mm_range_round_sd(A, B, C, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_range_round_sd(W, U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W),\
+ (__mmask8)(U), (int)(C),\
+ (int)(R)))
+
+#define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_range_round_sd(U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(C),\
+ (int)(R)))
+
+#define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C, _MM_FROUND_CUR_DIRECTION)
+
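+/* Illustrative usage sketch, not part of the upstream header: the immediate
+ * of the range intrinsics selects the operation (minimum, maximum and their
+ * absolute-value forms) plus the sign control; see the VRANGEPD/VRANGEPS
+ * documentation for the exact encoding. The immediate 0 below is only a
+ * placeholder and the variables are hypothetical.
+ *
+ *   __m512d a = _mm512_set1_pd(-1.0);
+ *   __m512d b = _mm512_set1_pd(2.0);
+ *   __m512d r = _mm512_range_pd(a, b, 0);
+ *   __m512d m = _mm512_mask_range_pd(a, 0xF0, a, b, 0);  // op in lanes 4..7, a elsewhere
+ */
+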
+#define _mm512_reduce_pd(A, B) \
+ ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_reduce_pd(W, U, A, B) \
+ ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_reduce_pd(U, A, B) \
+ ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_reduce_ps(A, B) \
+ ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_reduce_ps(W, U, A, B) \
+ ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+ (__v16sf)(__m512)(W), \
+ (__mmask16)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_reduce_ps(U, A, B) \
+ ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_reduce_round_pd(A, B, R) \
+ ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \
+ ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_reduce_round_pd(U, A, B, R) \
+ ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_reduce_round_ps(A, B, R) \
+ ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_reduce_round_ps(W, U, A, B, R) \
+ ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+ (__v16sf)(__m512)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_reduce_round_ps(U, A, B, R) \
+ ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm_reduce_ss(A, B, C) \
+ ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
+ (int)(C), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_reduce_ss(W, U, A, B, C) \
+ ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (int)(C), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_reduce_ss(U, A, B, C) \
+ ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(C), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_reduce_round_ss(A, B, C, R) \
+ ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
+ (int)(C), (int)(R)))
+
+#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \
+ ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (int)(C), (int)(R)))
+
+#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \
+ ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(C), (int)(R)))
+
+#define _mm_reduce_sd(A, B, C) \
+ ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(C), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_reduce_sd(W, U, A, B, C) \
+ ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (int)(C), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_reduce_sd(U, A, B, C) \
+ ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(C), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_reduce_round_sd(A, B, C, R) \
+ ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(C), (int)(R)))
+
+#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), (__mmask8)(U), \
+ (int)(C), (int)(R)))
+
+#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(C), (int)(R)))
+
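+/* Illustrative usage sketch, not part of the upstream header: VREDUCE returns
+ * each element minus that element rounded to a limited number of fraction
+ * bits; the immediate encodes the fraction-bit count and rounding control
+ * (see the VREDUCEPD/VREDUCEPS documentation). The immediate 0 below is only
+ * a placeholder.
+ *
+ *   __m512d x    = _mm512_set1_pd(2.75);
+ *   __m512d frac = _mm512_reduce_pd(x, 0);
+ */
+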
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
+_mm512_movepi32_mask (__m512i __A)
+{
+ return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_movm_epi32 (__mmask16 __A)
+{
+ return (__m512i) __builtin_ia32_cvtmask2d512 (__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_movm_epi64 (__mmask8 __A)
+{
+ return (__m512i) __builtin_ia32_cvtmask2q512 (__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
+_mm512_movepi64_mask (__m512i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A);
+}
+
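+/* Illustrative usage sketch, not part of the upstream header: movepi*_mask
+ * gathers the most significant (sign) bit of every element into a mask, and
+ * movm_epi* expands a mask back into lanes of all ones or all zeros.
+ *
+ *   __m512i v = _mm512_set1_epi32(-1);
+ *   __mmask16 m = _mm512_movepi32_mask(v);  // 0xFFFF: every sign bit is set
+ *   __m512i e = _mm512_movm_epi32(m);       // all lanes back to 0xFFFFFFFF
+ */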
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_broadcast_f32x2 (__m128 __A)
+{
+ return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
+ 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+ (__v16sf)_mm512_broadcast_f32x2(__A),
+ (__v16sf)__O);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+ (__v16sf)_mm512_broadcast_f32x2(__A),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_broadcast_f32x8(__m256 __A)
+{
+ return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+ (__v16sf)_mm512_broadcast_f32x8(__A),
+ (__v16sf)__O);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+ (__v16sf)_mm512_broadcast_f32x8(__A),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_broadcast_f64x2(__m128d __A)
+{
+ return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
+ 0, 1, 0, 1, 0, 1, 0, 1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+ (__v8df)_mm512_broadcast_f64x2(__A),
+ (__v8df)__O);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+ (__v8df)_mm512_broadcast_f64x2(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_broadcast_i32x2 (__m128i __A)
+{
+ return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+ 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_broadcast_i32x2(__A),
+ (__v16si)__O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_broadcast_i32x2(__A),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_broadcast_i32x8(__m256i __A)
+{
+ return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A,
+ 0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_broadcast_i32x8(__A),
+ (__v16si)__O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_broadcast_i32x8(__A),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_broadcast_i64x2(__m128i __A)
+{
+ return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
+ 0, 1, 0, 1, 0, 1, 0, 1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_broadcast_i64x2(__A),
+ (__v8di)__O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_broadcast_i64x2(__A),
+ (__v8di)_mm512_setzero_si512());
+}
+
+#define _mm512_extractf32x8_ps(A, imm) \
+ ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
+ (__v8sf)_mm256_undefined_ps(), \
+ (__mmask8)-1))
+
+#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
+ ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
+ (__v8sf)(__m256)(W), \
+ (__mmask8)(U)))
+
+#define _mm512_maskz_extractf32x8_ps(U, A, imm) \
+ ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)(U)))
+
+#define _mm512_extractf64x2_pd(A, imm) \
+ ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
+ (int)(imm), \
+ (__v2df)_mm_undefined_pd(), \
+ (__mmask8)-1))
+
+#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
+ ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
+ (int)(imm), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U)))
+
+#define _mm512_maskz_extractf64x2_pd(U, A, imm) \
+ ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
+ (int)(imm), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm512_extracti32x8_epi32(A, imm) \
+ ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
+ (__v8si)_mm256_undefined_si256(), \
+ (__mmask8)-1))
+
+#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
+ ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
+ (__v8si)(__m256i)(W), \
+ (__mmask8)(U)))
+
+#define _mm512_maskz_extracti32x8_epi32(U, A, imm) \
+ ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
+ (__v8si)_mm256_setzero_si256(), \
+ (__mmask8)(U)))
+
+#define _mm512_extracti64x2_epi64(A, imm) \
+ ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
+ (int)(imm), \
+ (__v2di)_mm_undefined_si128(), \
+ (__mmask8)-1))
+
+#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
+ ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
+ (int)(imm), \
+ (__v2di)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm512_maskz_extracti64x2_epi64(U, A, imm) \
+ ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
+ (int)(imm), \
+ (__v2di)_mm_setzero_si128(), \
+ (__mmask8)(U)))
+
+#define _mm512_insertf32x8(A, B, imm) \
+ ((__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \
+ (__v8sf)(__m256)(B), (int)(imm)))
+
+#define _mm512_mask_insertf32x8(W, U, A, B, imm) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
+ (__v16sf)(__m512)(W)))
+
+#define _mm512_maskz_insertf32x8(U, A, B, imm) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
+ (__v16sf)_mm512_setzero_ps()))
+
+#define _mm512_insertf64x2(A, B, imm) \
+ ((__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \
+ (__v2df)(__m128d)(B), (int)(imm)))
+
+#define _mm512_mask_insertf64x2(W, U, A, B, imm) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
+ (__v8df)(__m512d)(W)))
+
+#define _mm512_maskz_insertf64x2(U, A, B, imm) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
+ (__v8df)_mm512_setzero_pd()))
+
+#define _mm512_inserti32x8(A, B, imm) \
+ ((__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \
+ (__v8si)(__m256i)(B), (int)(imm)))
+
+#define _mm512_mask_inserti32x8(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
+ (__v16si)(__m512i)(W)))
+
+#define _mm512_maskz_inserti32x8(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
+ (__v16si)_mm512_setzero_si512()))
+
+#define _mm512_inserti64x2(A, B, imm) \
+ ((__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \
+ (__v2di)(__m128i)(B), (int)(imm)))
+
+#define _mm512_mask_inserti64x2(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
+ (__v8di)(__m512i)(W)))
+
+#define _mm512_maskz_inserti64x2(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
+ (__v8di)_mm512_setzero_si512()))
+
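+/* Illustrative usage sketch, not part of the upstream header: the extract and
+ * insert macros move a 128-bit or 256-bit lane group selected by the
+ * immediate, which must be a compile-time constant.
+ *
+ *   __m512 v  = _mm512_setzero_ps();
+ *   __m256 hi = _mm512_extractf32x8_ps(v, 1);  // upper 256 bits
+ *   __m512 w  = _mm512_insertf32x8(v, hi, 0);  // written into the lower half
+ */
+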
+#define _mm512_mask_fpclass_ps_mask(U, A, imm) \
+ ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
+ (int)(imm), (__mmask16)(U)))
+
+#define _mm512_fpclass_ps_mask(A, imm) \
+ ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
+ (int)(imm), (__mmask16)-1))
+
+#define _mm512_mask_fpclass_pd_mask(U, A, imm) \
+ ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm512_fpclass_pd_mask(A, imm) \
+ ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm_fpclass_sd_mask(A, imm) \
+ ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm_mask_fpclass_sd_mask(U, A, imm) \
+ ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm_fpclass_ss_mask(A, imm) \
+ ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm_mask_fpclass_ss_mask(U, A, imm) \
+ ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
+ (__mmask8)(U)))
+
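+/* Illustrative usage sketch, not part of the upstream header: the fpclass
+ * intrinsics return a mask of the lanes whose values fall into the categories
+ * selected by the immediate (quiet/signaling NaN, +/-0, +/-infinity, denormal,
+ * negative); see the VFPCLASSPD/VFPCLASSPS documentation for the bit
+ * assignments. The immediate 0x81 below is assumed to select both NaN kinds.
+ *
+ *   __m512d  v         = _mm512_set1_pd(0.0);
+ *   __mmask8 nan_lanes = _mm512_fpclass_pd_mask(v, 0x81);
+ */
+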
+#undef __DEFAULT_FN_ATTRS512
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512ERINTRIN_H
+#define __AVX512ERINTRIN_H
+
+/* exp2a23 */
+#define _mm512_exp2a23_round_pd(A, R) \
+ ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
+ ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(S), (__mmask8)(M), \
+ (int)(R)))
+
+#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
+ ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm512_exp2a23_pd(A) \
+ _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_pd(S, M, A) \
+ _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_pd(M, A) \
+ _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_round_ps(A, R) \
+ ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
+ ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(S), (__mmask16)(M), \
+ (int)(R)))
+
+#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
+ ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(M), (int)(R)))
+
+#define _mm512_exp2a23_ps(A) \
+ _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_ps(S, M, A) \
+ _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_ps(M, A) \
+ _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+/* rsqrt28 */
+#define _mm512_rsqrt28_round_pd(A, R) \
+ ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
+ ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(S), (__mmask8)(M), \
+ (int)(R)))
+
+#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
+ ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm512_rsqrt28_pd(A) \
+ _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_pd(S, M, A) \
+ _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_pd(M, A) \
+ _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_round_ps(A, R) \
+ ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
+ ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(S), (__mmask16)(M), \
+ (int)(R)))
+
+#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
+ ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(M), (int)(R)))
+
+#define _mm512_rsqrt28_ps(A) \
+ _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_ps(S, M, A) \
+  _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_ps(M, A) \
+ _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
+ ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(S), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
+ ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm_rsqrt28_ss(A, B) \
+ _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_ss(S, M, A, B) \
+ _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_ss(M, A, B) \
+ _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
+ ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(S), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
+ ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm_rsqrt28_sd(A, B) \
+ _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_sd(S, M, A, B) \
+ _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_sd(M, A, B) \
+ _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
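+/* Illustrative usage sketch, not part of the upstream header: the AVX-512ER
+ * approximations (exp2a23, rsqrt28, rcp28) have a relative error of at most
+ * 2^-28, so they are often usable without a refinement step.
+ *
+ *   __m512 x      = _mm512_set1_ps(4.0f);
+ *   __m512 approx = _mm512_rsqrt28_ps(x);  // each lane is approximately 0.5f
+ */
+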
+/* rcp28 */
+#define _mm512_rcp28_round_pd(A, R) \
+ ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
+ ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(S), (__mmask8)(M), \
+ (int)(R)))
+
+#define _mm512_maskz_rcp28_round_pd(M, A, R) \
+ ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm512_rcp28_pd(A) \
+ _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_pd(S, M, A) \
+ _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_pd(M, A) \
+ _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_round_ps(A, R) \
+ ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
+ ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(S), (__mmask16)(M), \
+ (int)(R)))
+
+#define _mm512_maskz_rcp28_round_ps(M, A, R) \
+ ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(M), (int)(R)))
+
+#define _mm512_rcp28_ps(A) \
+ _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_ps(S, M, A) \
+ _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_ps(M, A) \
+ _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
+ ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(S), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
+ ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm_rcp28_ss(A, B) \
+ _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_ss(S, M, A, B) \
+ _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_ss(M, A, B) \
+ _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
+ ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(S), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
+ ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm_rcp28_sd(A, B) \
+ _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_sd(S, M, A, B) \
+ _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_sd(M, A, B) \
+ _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#endif /* __AVX512ERINTRIN_H */
--- /dev/null
+/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FINTRIN_H
+#define __AVX512FINTRIN_H
+
+typedef char __v64qi __attribute__((__vector_size__(64)));
+typedef short __v32hi __attribute__((__vector_size__(64)));
+typedef double __v8df __attribute__((__vector_size__(64)));
+typedef float __v16sf __attribute__((__vector_size__(64)));
+typedef long long __v8di __attribute__((__vector_size__(64)));
+typedef int __v16si __attribute__((__vector_size__(64)));
+
+/* Unsigned types */
+typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
+typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
+typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
+typedef unsigned int __v16su __attribute__((__vector_size__(64)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v64qs __attribute__((__vector_size__(64)));
+
+typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
+typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64)));
+typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));
+
+typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1)));
+typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
+typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));
+
+typedef unsigned char __mmask8;
+typedef unsigned short __mmask16;
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+
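+/* Illustrative usage note, not part of the upstream header: the *_round_*
+ * intrinsics accept either _MM_FROUND_CUR_DIRECTION or one of the directed
+ * modes combined with _MM_FROUND_NO_EXC (defined in <smmintrin.h>) to
+ * suppress exceptions. Variables below are hypothetical.
+ *
+ *   __m512d sum = _mm512_add_round_pd(a, b,
+ *                                     _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+ */
+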
+/* Constants for integer comparison predicates */
+typedef enum {
+ _MM_CMPINT_EQ, /* Equal */
+ _MM_CMPINT_LT, /* Less than */
+ _MM_CMPINT_LE, /* Less than or Equal */
+ _MM_CMPINT_UNUSED,
+ _MM_CMPINT_NE, /* Not Equal */
+ _MM_CMPINT_NLT, /* Not Less than */
+#define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */
+ _MM_CMPINT_NLE /* Not Less than or Equal */
+#define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */
+} _MM_CMPINT_ENUM;
+
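+/* Illustrative usage sketch, not part of the upstream header: these predicate
+ * constants are passed as the immediate of the integer compare intrinsics
+ * declared later in this header. Variables below are hypothetical.
+ *
+ *   __mmask16 m = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LE);
+ */
+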
+typedef enum
+{
+ _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
+ _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
+ _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
+ _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
+ _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
+ _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
+ _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
+ _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
+ _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
+ _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
+ _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
+ _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
+ _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
+ _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
+ _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
+ _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
+ _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
+ _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
+ _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
+ _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
+ _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
+ _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
+ _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
+ _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
+ _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
+ _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
+ _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
+ _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
+ _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
+ _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
+ _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
+ _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
+ _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
+ _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
+ _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
+ _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
+ _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
+ _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
+ _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
+ _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
+ _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
+ _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
+ _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
+ _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
+ _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
+ _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
+ _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
+ _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
+ _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
+ _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
+ _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
+ _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
+ _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
+ _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
+ _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
+ _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
+ _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
+ _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
+ _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
+ _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
+ _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
+ _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
+ _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
+ _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
+ _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
+ _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
+ _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
+ _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
+ _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
+ _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
+ _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
+ _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
+ _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
+ _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
+ _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
+ _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
+ _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
+ _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
+ _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
+ _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
+ _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
+ _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
+ _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
+ _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
+ _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
+ _MM_PERM_DDDD = 0xFF
+} _MM_PERM_ENUM;
+
+typedef enum
+{
+ _MM_MANT_NORM_1_2, /* interval [1, 2) */
+ _MM_MANT_NORM_p5_2, /* interval [0.5, 2) */
+ _MM_MANT_NORM_p5_1, /* interval [0.5, 1) */
+ _MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */
+} _MM_MANTISSA_NORM_ENUM;
+
+typedef enum
+{
+ _MM_MANT_SIGN_src, /* sign = sign(SRC) */
+ _MM_MANT_SIGN_zero, /* sign = 0 */
+ _MM_MANT_SIGN_nan /* DEST = NaN if sign(SRC) = 1 */
+} _MM_MANTISSA_SIGN_ENUM;
+
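+/* Illustrative usage sketch, not part of the upstream header: these
+ * enumerators parameterise the getmant intrinsics declared later in this
+ * header, selecting the normalisation interval and the sign treatment of the
+ * extracted mantissa. The variable x is hypothetical.
+ *
+ *   __m512d m = _mm512_getmant_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
+ */
+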
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
+
+/* Create vectors with repeated elements */
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_setzero_si512(void)
+{
+ return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+#define _mm512_setzero_epi32 _mm512_setzero_si512
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_undefined_pd(void)
+{
+ return (__m512d)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_undefined(void)
+{
+ return (__m512)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_undefined_ps(void)
+{
+ return (__m512)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_undefined_epi32(void)
+{
+ return (__m512i)__builtin_ia32_undef512();
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_broadcastd_epi32 (__m128i __A)
+{
+ return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__M,
+ (__v16si) _mm512_broadcastd_epi32(__A),
+ (__v16si) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__M,
+ (__v16si) _mm512_broadcastd_epi32(__A),
+ (__v16si) _mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_broadcastq_epi64 (__m128i __A)
+{
+ return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
+ 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__M,
+ (__v8di) _mm512_broadcastq_epi64(__A),
+ (__v8di) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__M,
+ (__v8di) _mm512_broadcastq_epi64(__A),
+ (__v8di) _mm512_setzero_si512());
+}
+
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_setzero_ps(void)
+{
+ return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+#define _mm512_setzero _mm512_setzero_ps
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_setzero_pd(void)
+{
+ return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_set1_ps(float __w)
+{
+ return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
+ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_set1_pd(double __w)
+{
+ return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_set1_epi8(char __w)
+{
+ return __extension__ (__m512i)(__v64qi){
+ __w, __w, __w, __w, __w, __w, __w, __w,
+ __w, __w, __w, __w, __w, __w, __w, __w,
+ __w, __w, __w, __w, __w, __w, __w, __w,
+ __w, __w, __w, __w, __w, __w, __w, __w,
+ __w, __w, __w, __w, __w, __w, __w, __w,
+ __w, __w, __w, __w, __w, __w, __w, __w,
+ __w, __w, __w, __w, __w, __w, __w, __w,
+ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_set1_epi16(short __w)
+{
+ return __extension__ (__m512i)(__v32hi){
+ __w, __w, __w, __w, __w, __w, __w, __w,
+ __w, __w, __w, __w, __w, __w, __w, __w,
+ __w, __w, __w, __w, __w, __w, __w, __w,
+ __w, __w, __w, __w, __w, __w, __w, __w };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_set1_epi32(int __s)
+{
+ return __extension__ (__m512i)(__v16si){
+ __s, __s, __s, __s, __s, __s, __s, __s,
+ __s, __s, __s, __s, __s, __s, __s, __s };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__M,
+ (__v16si)_mm512_set1_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_set1_epi64(long long __d)
+{
+ return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__M,
+ (__v8di)_mm512_set1_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
+}
+
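+/* Illustrative usage sketch, not part of the upstream header: the maskz forms
+ * broadcast the scalar into the lanes whose mask bit is set and zero the
+ * rest.
+ *
+ *   __m512i v = _mm512_maskz_set1_epi32(0x00FF, 7);  // 7 in lanes 0..7, 0 in lanes 8..15
+ */
+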
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_broadcastss_ps(__m128 __A)
+{
+ return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
+{
+ return __extension__ (__m512i)(__v16si)
+ { __D, __C, __B, __A, __D, __C, __B, __A,
+ __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_set4_epi64 (long long __A, long long __B, long long __C,
+ long long __D)
+{
+ return __extension__ (__m512i) (__v8di)
+ { __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_set4_pd (double __A, double __B, double __C, double __D)
+{
+ return __extension__ (__m512d)
+ { __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_set4_ps (float __A, float __B, float __C, float __D)
+{
+ return __extension__ (__m512)
+ { __D, __C, __B, __A, __D, __C, __B, __A,
+ __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+#define _mm512_setr4_epi32(e0,e1,e2,e3) \
+ _mm512_set4_epi32((e3),(e2),(e1),(e0))
+
+#define _mm512_setr4_epi64(e0,e1,e2,e3) \
+ _mm512_set4_epi64((e3),(e2),(e1),(e0))
+
+#define _mm512_setr4_pd(e0,e1,e2,e3) \
+ _mm512_set4_pd((e3),(e2),(e1),(e0))
+
+#define _mm512_setr4_ps(e0,e1,e2,e3) \
+ _mm512_set4_ps((e3),(e2),(e1),(e0))
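+
+/* Note added for this port (illustrative, not part of the upstream header):
+ * _mm512_set4_* repeats its four arguments across the vector with the last
+ * argument in element 0, while the _mm512_setr4_* macros above reverse the
+ * argument order so the first argument lands in element 0:
+ *
+ *   __m512i a = _mm512_set4_epi32(1, 2, 3, 4);  // elements 4,3,2,1,4,3,2,1,...
+ *   __m512i b = _mm512_setr4_epi32(1, 2, 3, 4); // elements 1,2,3,4,1,2,3,4,...
+ */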
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_broadcastsd_pd(__m128d __A)
+{
+ return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
+ 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+/* Cast between vector types */
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_castpd256_pd512(__m256d __a)
+{
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_castps256_ps512(__m256 __a)
+{
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
+ -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline __m128d __DEFAULT_FN_ATTRS512
+_mm512_castpd512_pd128(__m512d __a)
+{
+ return __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+static __inline __m256d __DEFAULT_FN_ATTRS512
+_mm512_castpd512_pd256 (__m512d __A)
+{
+ return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
+}
+
+static __inline __m128 __DEFAULT_FN_ATTRS512
+_mm512_castps512_ps128(__m512 __a)
+{
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
+}
+
+static __inline __m256 __DEFAULT_FN_ATTRS512
+_mm512_castps512_ps256 (__m512 __A)
+{
+ return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_castpd_ps (__m512d __A)
+{
+ return (__m512) (__A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_castpd_si512 (__m512d __A)
+{
+ return (__m512i) (__A);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_castpd128_pd512 (__m128d __A)
+{
+ return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_castps_pd (__m512 __A)
+{
+ return (__m512d) (__A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_castps_si512 (__m512 __A)
+{
+ return (__m512i) (__A);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_castps128_ps512 (__m128 __A)
+{
+ return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_castsi128_si512 (__m128i __A)
+{
+ return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_castsi256_si512 (__m256i __A)
+{
+ return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_castsi512_ps (__m512i __A)
+{
+ return (__m512) (__A);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_castsi512_pd (__m512i __A)
+{
+ return (__m512d) (__A);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS512
+_mm512_castsi512_si128 (__m512i __A)
+{
+ return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS512
+_mm512_castsi512_si256 (__m512i __A)
+{
+ return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
+}
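+
+/* Note added for this port (illustrative, not part of the upstream header):
+ * the cast intrinsics above only reinterpret bits or select the low lanes
+ * and emit no conversion instructions; when widening, the upper lanes are
+ * left undefined. For example:
+ *
+ *   __m512d d = _mm512_castsi512_pd(_mm512_set1_epi64(1));
+ *   // each lane of d holds the bit pattern 0x1, not the value 1.0
+ */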
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_int2mask(int __a)
+{
+ return (__mmask16)__a;
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_mask2int(__mmask16 __a)
+{
+ return (int)__a;
+}
+
+/// Constructs a 512-bit floating-point vector of [8 x double] from a
+/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
+/// contain the value of the source vector. The upper 384 bits are set
+/// to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
+/// contain the value of the parameter. The upper 384 bits are set to zero.
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_zextpd128_pd512(__m128d __a)
+{
+ return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
+}
+
+/// Constructs a 512-bit floating-point vector of [8 x double] from a
+/// 256-bit floating-point vector of [4 x double]. The lower 256 bits
+/// contain the value of the source vector. The upper 256 bits are set
+/// to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
+/// contain the value of the parameter. The upper 256 bits are set to zero.
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_zextpd256_pd512(__m256d __a)
+{
+ return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+/// Constructs a 512-bit floating-point vector of [16 x float] from a
+/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+/// the value of the source vector. The upper 384 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
+/// contain the value of the parameter. The upper 384 bits are set to zero.
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_zextps128_ps512(__m128 __a)
+{
+ return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
+}
+
+/// Constructs a 512-bit floating-point vector of [16 x float] from a
+/// 256-bit floating-point vector of [8 x float]. The lower 256 bits contain
+/// the value of the source vector. The upper 256 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
+/// contain the value of the parameter. The upper 256 bits are set to zero.
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_zextps256_ps512(__m256 __a)
+{
+ return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+/// Constructs a 512-bit integer vector from a 128-bit integer vector.
+/// The lower 128 bits contain the value of the source vector. The upper
+/// 384 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 512-bit integer vector. The lower 128 bits contain the value of
+/// the parameter. The upper 384 bits are set to zero.
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_zextsi128_si512(__m128i __a)
+{
+ return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
+}
+
+/// Constructs a 512-bit integer vector from a 256-bit integer vector.
+/// The lower 256 bits contain the value of the source vector. The upper
+/// 256 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \returns A 512-bit integer vector. The lower 256 bits contain the value of
+/// the parameter. The upper 256 bits are set to zero.
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_zextsi256_si512(__m256i __a)
+{
+ return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
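+
+/* Note added for this port (illustrative, not part of the upstream header):
+ * unlike the cast intrinsics, the _mm512_zext* helpers above guarantee that
+ * the upper lanes are zeroed:
+ *
+ *   __m512i z = _mm512_zextsi128_si512(_mm_set1_epi32(-1));
+ *   // low 128 bits all ones, upper 384 bits guaranteed zero
+ */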
+
+/* Bitwise operators */
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_and_epi32(__m512i __a, __m512i __b)
+{
+ return (__m512i)((__v16su)__a & (__v16su)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
+ (__v16si) _mm512_and_epi32(__a, __b),
+ (__v16si) __src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
+ __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_and_epi64(__m512i __a, __m512i __b)
+{
+ return (__m512i)((__v8du)__a & (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
+ (__v8di) _mm512_and_epi64(__a, __b),
+ (__v8di) __src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
+ __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_andnot_si512 (__m512i __A, __m512i __B)
+{
+ return (__m512i)(~(__v8du)__A & (__v8du)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_andnot_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i)(~(__v16su)__A & (__v16su)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_andnot_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
+ __U, __A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_andnot_epi64(__m512i __A, __m512i __B)
+{
+ return (__m512i)(~(__v8du)__A & (__v8du)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_andnot_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
+ __U, __A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_or_epi32(__m512i __a, __m512i __b)
+{
+ return (__m512i)((__v16su)__a | (__v16su)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
+ (__v16si)_mm512_or_epi32(__a, __b),
+ (__v16si)__src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_or_epi64(__m512i __a, __m512i __b)
+{
+ return (__m512i)((__v8du)__a | (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
+ (__v8di)_mm512_or_epi64(__a, __b),
+ (__v8di)__src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_xor_epi32(__m512i __a, __m512i __b)
+{
+ return (__m512i)((__v16su)__a ^ (__v16su)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
+ (__v16si)_mm512_xor_epi32(__a, __b),
+ (__v16si)__src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_xor_epi64(__m512i __a, __m512i __b)
+{
+ return (__m512i)((__v8du)__a ^ (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
+ (__v8di)_mm512_xor_epi64(__a, __b),
+ (__v8di)__src);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
+{
+ return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_and_si512(__m512i __a, __m512i __b)
+{
+ return (__m512i)((__v8du)__a & (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_or_si512(__m512i __a, __m512i __b)
+{
+ return (__m512i)((__v8du)__a | (__v8du)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_xor_si512(__m512i __a, __m512i __b)
+{
+ return (__m512i)((__v8du)__a ^ (__v8du)__b);
+}
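+
+/* Note added for this port (illustrative, not part of the upstream header):
+ * the *_si512, *_epi32 and *_epi64 AND/OR/XOR variants produce identical bit
+ * patterns; the element width only matters for the masked forms. The andnot
+ * operations compute (~A) & B:
+ *
+ *   __m512i r = _mm512_andnot_si512(mask, bits);  // clears from bits every bit set in mask
+ */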
+
+/* Arithmetic */
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_add_pd(__m512d __a, __m512d __b)
+{
+ return (__m512d)((__v8df)__a + (__v8df)__b);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_add_ps(__m512 __a, __m512 __b)
+{
+ return (__m512)((__v16sf)__a + (__v16sf)__b);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_mul_pd(__m512d __a, __m512d __b)
+{
+ return (__m512d)((__v8df)__a * (__v8df)__b);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_mul_ps(__m512 __a, __m512 __b)
+{
+ return (__m512)((__v16sf)__a * (__v16sf)__b);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_sub_pd(__m512d __a, __m512d __b)
+{
+ return (__m512d)((__v8df)__a - (__v8df)__b);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_sub_ps(__m512 __a, __m512 __b)
+{
+ return (__m512)((__v16sf)__a - (__v16sf)__b);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_add_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v8du) __A + (__v8du) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_add_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_add_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sub_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v8du) __A - (__v8du) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sub_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sub_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_add_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16su) __A + (__v16su) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_add_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_add_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sub_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16su) __A - (__v16su) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sub_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sub_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
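+
+/* Illustrative usage sketch added for this port (not part of the upstream
+ * header): masked integer arithmetic follows the same select pattern used
+ * throughout this file, e.g.
+ *
+ *   __m512i r = _mm512_mask_add_epi32(src, 0x000F, a, b);
+ *   // lanes 0-3 = a + b, lanes 4-15 copied from src
+ */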
+
+#define _mm512_max_round_pd(A, B, R) \
+ ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(R)))
+
+#define _mm512_mask_max_round_pd(W, U, A, B, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_max_round_pd((A), (B), (R)), \
+ (__v8df)(W)))
+
+#define _mm512_maskz_max_round_pd(U, A, B, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_max_round_pd((A), (B), (R)), \
+ (__v8df)_mm512_setzero_pd()))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_max_pd(__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d)__builtin_ia32_selectpd_512(__U,
+ (__v8df)_mm512_max_pd(__A, __B),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d)__builtin_ia32_selectpd_512(__U,
+ (__v8df)_mm512_max_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+#define _mm512_max_round_ps(A, B, R) \
+ ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(R)))
+
+#define _mm512_mask_max_round_ps(W, U, A, B, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
+ (__v16sf)(W)))
+
+#define _mm512_maskz_max_round_ps(U, A, B, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
+ (__v16sf)_mm512_setzero_ps()))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_max_ps(__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512)__builtin_ia32_selectps_512(__U,
+ (__v16sf)_mm512_max_ps(__A, __B),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512)__builtin_ia32_selectps_512(__U,
+ (__v16sf)_mm512_max_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
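+
+/* Note added for this port (illustrative, not part of the upstream header):
+ * like the underlying VMAXPD/VMAXPS instructions, these max operations are
+ * not symmetric: when either input is NaN, or when both inputs are zero of
+ * either sign, the second source operand is returned, e.g.
+ *
+ *   _mm512_max_ps(nan_vec, b)  // yields b
+ */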
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_max_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (int)(R)))
+
+#define _mm_maskz_max_round_ss(U, A, B, R) \
+ ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_max_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_max_round_sd(U, A, B, R) \
+ ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_max_epi32(__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pmaxsd512((__v16si)__A, (__v16si)__B);
+#else
+ return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B);
+#endif
+}
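+
+/* Note added for this port: the (__clang_major__ < 14) guards in the
+ * min/max/abs helpers are assumed to track the Clang 14 switch from the
+ * x86-specific builtins (e.g. __builtin_ia32_pmaxsd512) to the generic
+ * __builtin_elementwise_max/min/abs builtins, which older compilers do not
+ * provide.
+ */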
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_max_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_max_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_max_epu32(__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pmaxud512((__v16si)__A, (__v16si)__B);
+#else
+ return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_max_epu32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_max_epu32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_max_epi64(__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pmaxsq512((__v8di)__A, (__v8di)__B);
+#else
+ return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_max_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_max_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_max_epu64(__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pmaxuq512((__v8di)__A, (__v8di)__B);
+#else
+ return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_max_epu64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_max_epu64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+#define _mm512_min_round_pd(A, B, R) \
+ ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(R)))
+
+#define _mm512_mask_min_round_pd(W, U, A, B, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_min_round_pd((A), (B), (R)), \
+ (__v8df)(W)))
+
+#define _mm512_maskz_min_round_pd(U, A, B, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_min_round_pd((A), (B), (R)), \
+ (__v8df)_mm512_setzero_pd()))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_min_pd(__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d)__builtin_ia32_selectpd_512(__U,
+ (__v8df)_mm512_min_pd(__A, __B),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d)__builtin_ia32_selectpd_512(__U,
+ (__v8df)_mm512_min_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+#define _mm512_min_round_ps(A, B, R) \
+ ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(R)))
+
+#define _mm512_mask_min_round_ps(W, U, A, B, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
+ (__v16sf)(W)))
+
+#define _mm512_maskz_min_round_ps(U, A, B, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
+ (__v16sf)_mm512_setzero_ps()))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_min_ps(__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512)__builtin_ia32_selectps_512(__U,
+ (__v16sf)_mm512_min_ps(__A, __B),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512)__builtin_ia32_selectps_512(__U,
+ (__v16sf)_mm512_min_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_min_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (int)(R)))
+
+#define _mm_maskz_min_round_ss(U, A, B, R) \
+ ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_min_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_min_round_sd(U, A, B, R) \
+ ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_min_epi32(__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pminsd512((__v16si)__A, (__v16si)__B);
+#else
+ return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_min_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_min_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_min_epu32(__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pminud512((__v16si)__A, (__v16si)__B);
+#else
+ return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_min_epu32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_min_epu32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_min_epi64(__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pminsq512((__v8di)__A, (__v8di)__B);
+#else
+ return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_min_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_min_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_min_epu64(__m512i __A, __m512i __B)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pminuq512((__v8di)__A, (__v8di)__B);
+#else
+ return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_min_epu64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_min_epu64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mul_epi32(__m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epi32(__X, __Y),
+ (__v8di)__W);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epi32(__X, __Y),
+ (__v8di)_mm512_setzero_si512 ());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mul_epu32(__m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epu32(__X, __Y),
+ (__v8di)__W);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_mul_epu32(__X, __Y),
+ (__v8di)_mm512_setzero_si512 ());
+}
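+
+/* Note added for this port (illustrative, not part of the upstream header):
+ * _mm512_mul_epi32 and _mm512_mul_epu32 are widening multiplies; only the
+ * even-numbered 32-bit elements of each source are used, producing eight
+ * 64-bit products, e.g.
+ *
+ *   __m512i p = _mm512_mul_epu32(_mm512_set1_epi32(3), _mm512_set1_epi32(5));
+ *   // p = {15, 15, 15, 15, 15, 15, 15, 15} as 64-bit lanes
+ */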
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mullo_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16su) __A * (__v16su) __B);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_mullo_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_mullo_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mullox_epi64 (__m512i __A, __m512i __B) {
+ return (__m512i) ((__v8du) __A * (__v8du) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_mullox_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+#define _mm512_sqrt_round_pd(A, R) \
+ ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)))
+
+#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_sqrt_round_pd((A), (R)), \
+ (__v8df)(__m512d)(W)))
+
+#define _mm512_maskz_sqrt_round_pd(U, A, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_sqrt_round_pd((A), (R)), \
+ (__v8df)_mm512_setzero_pd()))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_sqrt_pd(__m512d __A)
+{
+ return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512(__U,
+ (__v8df)_mm512_sqrt_pd(__A),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512(__U,
+ (__v8df)_mm512_sqrt_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+#define _mm512_sqrt_round_ps(A, R) \
+ ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)))
+
+#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
+ (__v16sf)(__m512)(W)))
+
+#define _mm512_maskz_sqrt_round_ps(U, A, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
+ (__v16sf)_mm512_setzero_ps()))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_sqrt_ps(__m512 __A)
+{
+ return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512(__U,
+ (__v16sf)_mm512_sqrt_ps(__A),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512(__U,
+ (__v16sf)_mm512_sqrt_ps(__A),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_rsqrt14_pd(__m512d __A)
+{
+ return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_rsqrt14_ps(__m512 __A)
+{
+ return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_rsqrt14_ss(__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_rsqrt14_sd(__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
+ (__v2df) __B,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_rcp14_pd(__m512d __A)
+{
+ return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_rcp14_ps(__m512 __A)
+{
+ return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_rcp14_ss(__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_rcp14_sd(__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
+ (__v2df) __B,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) __U);
+}
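+
+/* Note added for this port (illustrative, not part of the upstream header):
+ * the rcp14/rsqrt14 intrinsics return approximations of 1/x and 1/sqrt(x)
+ * with a relative error of at most 2^-14; callers needing full precision
+ * typically refine the estimate with a Newton-Raphson step, e.g.
+ *
+ *   __m512 x0 = _mm512_rcp14_ps(a);                      // ~1/a
+ *   __m512 r  = _mm512_sub_ps(_mm512_add_ps(x0, x0),     // 2*x0 - a*x0*x0
+ *                             _mm512_mul_ps(a, _mm512_mul_ps(x0, x0)));
+ */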
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_floor_ps(__m512 __A)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+ _MM_FROUND_FLOOR,
+ (__v16sf) __A, -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+ _MM_FROUND_FLOOR,
+ (__v16sf) __W, __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_floor_pd(__m512d __A)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+ _MM_FROUND_FLOOR,
+ (__v8df) __A, -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+ _MM_FROUND_FLOOR,
+ (__v8df) __W, __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+ _MM_FROUND_CEIL,
+ (__v16sf) __W, __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_ceil_ps(__m512 __A)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+ _MM_FROUND_CEIL,
+ (__v16sf) __A, -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_ceil_pd(__m512d __A)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+ _MM_FROUND_CEIL,
+ (__v8df) __A, -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+ _MM_FROUND_CEIL,
+ (__v8df) __W, __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
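+
+/* Note added for this port (illustrative, not part of the upstream header):
+ * floor/ceil are built on VRNDSCALE with the rounding selector
+ * (_MM_FROUND_FLOOR or _MM_FROUND_CEIL) passed as the immediate, e.g.
+ *
+ *   __m512 f = _mm512_floor_ps(_mm512_set1_ps(1.5f));  // every lane = 1.0f
+ */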
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_abs_epi64(__m512i __A)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pabsq512((__v8di)__A);
+#else
+ return (__m512i)__builtin_elementwise_abs((__v8di)__A);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_abs_epi64(__A),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_abs_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_abs_epi32(__m512i __A)
+{
+#if (__clang_major__ < 14)
+ return (__m512i)__builtin_ia32_pabsd512((__v16si) __A);
+#else
+ return (__m512i)__builtin_elementwise_abs((__v16si) __A);
+#endif
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_abs_epi32(__A),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_abs_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+ __A = _mm_add_ss(__A, __B);
+ return __builtin_ia32_selectss_128(__U, __A, __W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+ __A = _mm_add_ss(__A, __B);
+ return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
+}
+
+#define _mm_add_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_add_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (int)(R)))
+
+#define _mm_maskz_add_round_ss(U, A, B, R) \
+ ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+ __A = _mm_add_sd(__A, __B);
+ return __builtin_ia32_selectsd_128(__U, __A, __W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+ __A = _mm_add_sd(__A, __B);
+ return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
+}
+#define _mm_add_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_add_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_add_round_sd(U, A, B, R) \
+ ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
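+
+/* Note added for this port (illustrative, not part of the upstream header):
+ * the *_round_* forms take an explicit rounding/exception control instead of
+ * the current MXCSR setting; the argument is built from the _MM_FROUND_*
+ * constants, e.g.
+ *
+ *   __m128 r = _mm_add_round_ss(a, b,
+ *                               _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ */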
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_add_pd(__A, __B),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_add_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_add_ps(__A, __B),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_add_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_add_round_pd(A, B, R) \
+ ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(R)))
+
+#define _mm512_mask_add_round_pd(W, U, A, B, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_add_round_pd((A), (B), (R)), \
+ (__v8df)(__m512d)(W)))
+
+#define _mm512_maskz_add_round_pd(U, A, B, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_add_round_pd((A), (B), (R)), \
+ (__v8df)_mm512_setzero_pd()))
+
+#define _mm512_add_round_ps(A, B, R) \
+ ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(R)))
+
+#define _mm512_mask_add_round_ps(W, U, A, B, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
+ (__v16sf)(__m512)(W)))
+
+#define _mm512_maskz_add_round_ps(U, A, B, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
+ (__v16sf)_mm512_setzero_ps()))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+ __A = _mm_sub_ss(__A, __B);
+ return __builtin_ia32_selectss_128(__U, __A, __W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+ __A = _mm_sub_ss(__A, __B);
+ return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
+}
+#define _mm_sub_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_sub_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (int)(R)))
+
+#define _mm_maskz_sub_round_ss(U, A, B, R) \
+ ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+ __A = _mm_sub_sd(__A, __B);
+ return __builtin_ia32_selectsd_128(__U, __A, __W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+ __A = _mm_sub_sd(__A, __B);
+ return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
+}
+
+#define _mm_sub_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_sub_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_sub_round_sd(U, A, B, R) \
+ ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_sub_pd(__A, __B),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_sub_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_sub_ps(__A, __B),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_sub_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_sub_round_pd(A, B, R) \
+ ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(R)))
+
+#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
+ (__v8df)(__m512d)(W)))
+
+#define _mm512_maskz_sub_round_pd(U, A, B, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
+ (__v8df)_mm512_setzero_pd()))
+
+#define _mm512_sub_round_ps(A, B, R) \
+ ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(R)))
+
+#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
+ (__v16sf)(__m512)(W)))
+
+#define _mm512_maskz_sub_round_ps(U, A, B, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
+ (__v16sf)_mm512_setzero_ps()))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
+ __A = _mm_mul_ss(__A, __B);
+ return __builtin_ia32_selectss_128(__U, __A, __W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
+ __A = _mm_mul_ss(__A, __B);
+ return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
+}
+#define _mm_mul_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_mul_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (int)(R)))
+
+#define _mm_maskz_mul_round_ss(U, A, B, R) \
+ ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
+ __A = _mm_mul_sd(__A, __B);
+ return __builtin_ia32_selectsd_128(__U, __A, __W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
+ __A = _mm_mul_sd(__A, __B);
+ return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
+}
+
+#define _mm_mul_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_mul_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_mul_round_sd(U, A, B, R) \
+ ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_mul_pd(__A, __B),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_mul_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_mul_ps(__A, __B),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_mul_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_mul_round_pd(A, B, R) \
+ ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(R)))
+
+#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
+ (__v8df)(__m512d)(W)))
+
+#define _mm512_maskz_mul_round_pd(U, A, B, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
+ (__v8df)_mm512_setzero_pd()))
+
+#define _mm512_mul_round_ps(A, B, R) \
+ ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(R)))
+
+#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
+ (__v16sf)(__m512)(W)))
+
+#define _mm512_maskz_mul_round_ps(U, A, B, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
+ (__v16sf)_mm512_setzero_ps()))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ __A = _mm_div_ss(__A, __B);
+ return __builtin_ia32_selectss_128(__U, __A, __W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B) {
+ __A = _mm_div_ss(__A, __B);
+ return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
+}
+
+#define _mm_div_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_div_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (int)(R)))
+
+#define _mm_maskz_div_round_ss(U, A, B, R) \
+ ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ __A = _mm_div_sd(__A, __B);
+ return __builtin_ia32_selectsd_128(__U, __A, __W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B) {
+ __A = _mm_div_sd(__A, __B);
+ return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
+}
+
+#define _mm_div_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_div_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_div_round_sd(U, A, B, R) \
+ ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_div_pd(__m512d __a, __m512d __b)
+{
+ return (__m512d)((__v8df)__a/(__v8df)__b);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_div_pd(__A, __B),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_div_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_div_ps(__m512 __a, __m512 __b)
+{
+ return (__m512)((__v16sf)__a/(__v16sf)__b);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_div_ps(__A, __B),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_div_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+#define _mm512_div_round_pd(A, B, R) \
+ ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(R)))
+
+#define _mm512_mask_div_round_pd(W, U, A, B, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_div_round_pd((A), (B), (R)), \
+ (__v8df)(__m512d)(W)))
+
+#define _mm512_maskz_div_round_pd(U, A, B, R) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_div_round_pd((A), (B), (R)), \
+ (__v8df)_mm512_setzero_pd()))
+
+#define _mm512_div_round_ps(A, B, R) \
+ ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(R)))
+
+#define _mm512_mask_div_round_ps(W, U, A, B, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
+ (__v16sf)(__m512)(W)))
+
+#define _mm512_maskz_div_round_ps(U, A, B, R) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
+ (__v16sf)_mm512_setzero_ps()))
+
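+/* Round-scale: rounds each element to the nearest value representable with
+ * imm[7:4] fraction bits; the low immediate bits select the rounding mode and
+ * exception behavior (VRNDSCALEPS/VRNDSCALEPD). */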
+#define _mm512_roundscale_ps(A, B) \
+ ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
+ (__v16sf)_mm512_undefined_ps(), \
+ (__mmask16)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_roundscale_ps(A, B, C, imm) \
+ ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
+ (__v16sf)(__m512)(A), (__mmask16)(B), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_roundscale_ps(A, B, imm) \
+ ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(A), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \
+ ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
+ (__v16sf)(__m512)(A), (__mmask16)(B), \
+ (int)(R)))
+
+#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \
+ ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(A), (int)(R)))
+
+#define _mm512_roundscale_round_ps(A, imm, R) \
+ ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
+ (__v16sf)_mm512_undefined_ps(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_roundscale_pd(A, B) \
+ ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
+ (__v8df)_mm512_undefined_pd(), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_roundscale_pd(A, B, C, imm) \
+ ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
+ (__v8df)(__m512d)(A), (__mmask8)(B), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_roundscale_pd(A, B, imm) \
+ ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(A), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \
+ ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
+ (__v8df)(__m512d)(A), (__mmask8)(B), \
+ (int)(R)))
+
+#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \
+ ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(A), (int)(R)))
+
+#define _mm512_roundscale_round_pd(A, imm, R) \
+ ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
+ (__v8df)_mm512_undefined_pd(), \
+ (__mmask8)-1, (int)(R)))
+
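+/* Fused multiply-add family with explicit rounding control. R must be a
+ * compile-time constant, either _MM_FROUND_CUR_DIRECTION or a rounding mode
+ * combined with _MM_FROUND_NO_EXC. Illustrative use (not part of the
+ * original header):
+ *   __m512d r = _mm512_fmadd_round_pd(a, b, c,
+ *                  _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ */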
+#define _mm512_fmadd_round_pd(A, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)-1, (int)(R)))
+
+
+#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_fmsub_round_pd(A, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ -(__v8df)(__m512d)(C), \
+ (__mmask8)-1, (int)(R)))
+
+
+#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ -(__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ -(__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_fnmadd_round_pd(A, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)-1, (int)(R)))
+
+
+#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_fnmsub_round_pd(A, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ -(__v8df)(__m512d)(C), \
+ (__mmask8)-1, (int)(R)))
+
+
+#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ -(__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+ -(__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+ -(__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmadd_round_ps(A, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)-1, (int)(R)))
+
+
+#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+#define _mm512_fmsub_round_ps(A, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ -(__v16sf)(__m512)(C), \
+ (__mmask16)-1, (int)(R)))
+
+
+#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ -(__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ -(__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+#define _mm512_fnmadd_round_ps(A, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+ -(__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)-1, (int)(R)))
+
+
+#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+#define _mm512_fnmsub_round_ps(A, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+ -(__v16sf)(__m512)(B), \
+ -(__v16sf)(__m512)(C), \
+ (__mmask16)-1, (int)(R)))
+
+
+#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ -(__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+ -(__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+ -(__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
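+/* Fused multiply with alternating add/subtract: fmaddsub subtracts C in the
+ * even-indexed elements and adds C in the odd-indexed elements; fmsubadd does
+ * the opposite. */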
+#define _mm512_fmaddsub_round_pd(A, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)-1, (int)(R)))
+
+
+#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
+ ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_fmsubadd_round_pd(A, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ -(__v8df)(__m512d)(C), \
+ (__mmask8)-1, (int)(R)))
+
+
+#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ -(__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ -(__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmaddsub_round_ps(A, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)-1, (int)(R)))
+
+
+#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
+ ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+#define _mm512_fmsubadd_round_ps(A, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ -(__v16sf)(__m512)(C), \
+ (__mmask16)-1, (int)(R)))
+
+
+#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ -(__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ -(__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
+ ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+ return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
+ ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+ return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
+ ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+ return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
+ ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+ return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+ -(__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+ -(__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+ -(__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+ -(__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
+ ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
+ -(__v8df)(__m512d)(B), \
+ -(__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
+ ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+ -(__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+ return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
+ ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
+ -(__v16sf)(__m512)(B), \
+ -(__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
+ ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(C), \
+ (__mmask16)(U), (int)(R)))
+
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+ -(__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+ return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+
+
+/* Vector permutations */
+
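+/* permutex2var: full two-source permute. Each destination element is selected
+ * from the concatenation of __A and __B by the corresponding index in __I;
+ * the bit just above the element index picks __B over __A. */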
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
+ (__v16si) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
+ (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
+ (__v16si)__I);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
+ (__v8di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__U,
+ (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
+ (__v8di)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__U,
+ (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
+ (__v8di)__I);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__U,
+ (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
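+/* valign: treats __A:__B as a single double-width value (__A in the upper
+ * half) and extracts 512 bits starting at element offset I, counted in 64-bit
+ * (alignr_epi64) or 32-bit (alignr_epi32) elements. */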
+#define _mm512_alignr_epi64(A, B, I) \
+ ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), (int)(I)))
+
+#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+ (__v8di)(__m512i)(W)))
+
+#define _mm512_maskz_alignr_epi64(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
+ (__v8di)_mm512_setzero_si512()))
+
+#define _mm512_alignr_epi32(A, B, I) \
+ ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B), (int)(I)))
+
+#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+ (__v16si)(__m512i)(W)))
+
+#define _mm512_maskz_alignr_epi32(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
+ (__v16si)_mm512_setzero_si512()))
+/* Vector Extract */
+
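+/* extractf64x4/extractf32x4 copy the 256-bit or 128-bit lane selected by the
+ * immediate out of the 512-bit source; the masked forms blend the extracted
+ * lane with W or zero it per element. */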
+#define _mm512_extractf64x4_pd(A, I) \
+ ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
+ (__v4df)_mm256_undefined_pd(), \
+ (__mmask8)-1))
+
+#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
+ ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
+ (__v4df)(__m256d)(W), \
+ (__mmask8)(U)))
+
+#define _mm512_maskz_extractf64x4_pd(U, A, imm) \
+ ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
+ (__v4df)_mm256_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm512_extractf32x4_ps(A, I) \
+ ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
+ (__v4sf)_mm_undefined_ps(), \
+ (__mmask8)-1))
+
+#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
+ ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U)))
+
+#define _mm512_maskz_extractf32x4_ps(U, A, imm) \
+ ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U)))
+
+/* Vector Blend */
+
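+/* Blend: for each element, a set mask bit selects the element from __W and a
+ * clear bit selects it from __A. */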
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
+{
+ return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
+ (__v8df) __W,
+ (__v8df) __A);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
+{
+ return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
+ (__v16sf) __W,
+ (__v16sf) __A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
+{
+ return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
+ (__v8di) __W,
+ (__v8di) __A);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
+{
+ return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
+ (__v16si) __W,
+ (__v16si) __A);
+}
+
+/* Compare */
+
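+/* Compare: evaluates the predicate P (one of the _CMP_* constants) per
+ * element and returns the results as a bitmask. The *_round_* forms take an
+ * additional exception-suppression control such as _MM_FROUND_NO_EXC.
+ * Illustrative use (not part of the original header):
+ *   __mmask16 lt = _mm512_cmp_ps_mask(a, b, _CMP_LT_OS);
+ */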
+#define _mm512_cmp_round_ps_mask(A, B, P, R) \
+ ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(P), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \
+ ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(P), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_cmp_ps_mask(A, B, P) \
+ _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
+ _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_cmpeq_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_ps_mask(A, B) \
+ _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_ps_mask(k, A, B) \
+ _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
+
+#define _mm512_cmp_round_pd_mask(A, B, P, R) \
+ ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(P), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \
+ ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(P), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_cmp_pd_mask(A, B, P) \
+ _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
+ _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_cmpeq_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
+#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)
+
+#define _mm512_cmplt_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
+#define _mm512_mask_cmplt_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)
+
+#define _mm512_cmple_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
+#define _mm512_mask_cmple_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)
+
+#define _mm512_cmpunord_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
+#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)
+
+#define _mm512_cmpneq_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
+#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)
+
+#define _mm512_cmpnlt_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
+#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)
+
+#define _mm512_cmpnle_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
+#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)
+
+#define _mm512_cmpord_pd_mask(A, B) \
+ _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
+#define _mm512_mask_cmpord_pd_mask(k, A, B) \
+ _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
+
+/* Conversion */
+
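+/* Conversions. The cvtt_* forms convert with truncation toward zero; the
+ * cvt_round* forms take an explicit rounding-control argument, and the plain
+ * forms round according to the current MXCSR rounding mode. */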
+#define _mm512_cvtt_roundps_epu32(A, R) \
+ ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
+ (__v16si)_mm512_undefined_epi32(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \
+ ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
+ (__v16si)(__m512i)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \
+ ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
+ (__v16si)_mm512_setzero_si512(), \
+ (__mmask16)(U), (int)(R)))
+
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvttps_epu32(__m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+ (__v16si) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+ (__v16si) _mm512_setzero_si512 (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi32_ps(A, R) \
+ ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \
+ ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
+ (__v16sf)(__m512)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \
+ ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_cvt_roundepu32_ps(A, R) \
+ ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \
+ ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
+ (__v16sf)(__m512)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \
+ ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_cvtepu32_ps (__m512i __A)
+{
+ return (__m512)__builtin_convertvector((__v16su)__A, __v16sf);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_cvtepu32_ps(__A),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_cvtepu32_ps(__A),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_cvtepi32_pd(__m256i __A)
+{
+ return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_cvtepi32_pd(__A),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_cvtepi32_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_cvtepi32lo_pd(__m512i __A)
+{
+ return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_cvtepi32_ps (__m512i __A)
+{
+ return (__m512)__builtin_convertvector((__v16si)__A, __v16sf);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_cvtepi32_ps(__A),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_cvtepi32_ps(__A),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_cvtepu32_pd(__m256i __A)
+{
+ return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_cvtepu32_pd(__A),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_cvtepu32_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_cvtepu32lo_pd(__m512i __A)
+{
+ return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
+}
+
+#define _mm512_cvt_roundpd_ps(A, R) \
+ ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \
+ ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
+ (__v8sf)(__m256)(W), (__mmask8)(U), \
+ (int)(R)))
+
+#define _mm512_maskz_cvt_roundpd_ps(U, A, R) \
+ ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS512
+_mm512_cvtpd_ps (__m512d __A)
+{
+ return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+ (__v8sf) _mm256_undefined_ps (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
+{
+ return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+ (__v8sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
+{
+ return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+ (__v8sf) _mm256_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_cvtpd_pslo (__m512d __A)
+{
+ return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
+ (__v8sf) _mm256_setzero_ps (),
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512) __builtin_shufflevector (
+ (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
+ __U, __A),
+ (__v8sf) _mm256_setzero_ps (),
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+#define _mm512_cvt_roundps_ph(A, I) \
+ ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+ (__v16hi)_mm256_undefined_si256(), \
+ (__mmask16)-1))
+
+#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \
+ ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+ (__v16hi)(__m256i)(U), \
+ (__mmask16)(W)))
+
+#define _mm512_maskz_cvt_roundps_ph(W, A, I) \
+ ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
+ (__v16hi)_mm256_setzero_si256(), \
+ (__mmask16)(W)))
+
+#define _mm512_cvtps_ph _mm512_cvt_roundps_ph
+#define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph
+#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph
+
+#define _mm512_cvt_roundph_ps(A, R) \
+ ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
+ (__v16sf)_mm512_undefined_ps(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundph_ps(W, U, A, R) \
+ ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
+ (__v16sf)(__m512)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundph_ps(U, A, R) \
+ ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(U), (int)(R)))
+
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_cvtph_ps(__m256i __A)
+{
+ return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
+{
+ return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
+{
+ return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+ (__v16sf) _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundpd_epi32(A, R) \
+ ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
+ (__v8si)_mm256_setzero_si256(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \
+ ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
+ (__v8si)(__m256i)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \
+ ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
+ (__v8si)_mm256_setzero_si256(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvttpd_epi32(__m512d __a)
+{
+ return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
+ (__v8si)_mm256_setzero_si256(),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+ (__v8si) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+ (__v8si) _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundps_epi32(A, R) \
+ ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
+ (__v16si)_mm512_setzero_si512(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \
+ ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
+ (__v16si)(__m512i)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \
+ ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
+ (__v16si)_mm512_setzero_si512(), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvttps_epi32(__m512 __a)
+{
+ return (__m512i)
+ __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
+ (__v16si) _mm512_setzero_si512 (),
+ (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+ (__v16si) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+ (__v16si) _mm512_setzero_si512 (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epi32(A, R) \
+ ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
+ (__v16si)_mm512_setzero_si512(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \
+ ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
+ (__v16si)(__m512i)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \
+ ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
+ (__v16si)_mm512_setzero_si512(), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtps_epi32 (__m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+ (__v16si) _mm512_undefined_epi32 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+ (__v16si) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epi32(A, R) \
+ ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
+ (__v8si)_mm256_setzero_si256(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \
+ ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
+ (__v8si)(__m256i)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \
+ ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
+ (__v8si)_mm256_setzero_si256(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtpd_epi32 (__m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+ (__v8si) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundps_epu32(A, R) \
+ ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
+ (__v16si)_mm512_setzero_si512(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \
+ ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
+ (__v16si)(__m512i)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \
+ ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
+ (__v16si)_mm512_setzero_si512(), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtps_epu32 (__m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+ (__v16si) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundpd_epu32(A, R) \
+ ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
+ (__v8si)_mm256_setzero_si256(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
+ ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
+ (__v8si)(__m256i)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
+ ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
+ (__v8si)_mm256_setzero_si256(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtpd_epu32 (__m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+ (__v8si) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_cvtsd_f64(__m512d __a)
+{
+ return __a[0];
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_cvtss_f32(__m512 __a)
+{
+ return __a[0];
+}
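+
+/* Illustrative usage sketch (not part of the upstream header). Converting
+ * eight doubles to 32-bit integers with an explicit rounding mode; the
+ * variable names below are placeholders.
+ *
+ *   __m512d v = _mm512_set1_pd(2.5);
+ *   // Round-to-nearest-even, suppressing exceptions:
+ *   __m256i r = _mm512_cvt_roundpd_epi32(v, _MM_FROUND_TO_NEAREST_INT |
+ *                                           _MM_FROUND_NO_EXC);
+ *   double first = _mm512_cvtsd_f64(v);   // reads element 0, here 2.5
+ */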
+
+/* Unpack and Interleave */
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_unpackhi_pd(__m512d __a, __m512d __b)
+{
+ return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
+ 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_unpackhi_pd(__A, __B),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_unpackhi_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_unpacklo_pd(__m512d __a, __m512d __b)
+{
+ return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
+ 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_unpacklo_pd(__A, __B),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
+ (__v8df)_mm512_unpacklo_pd(__A, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_unpackhi_ps(__m512 __a, __m512 __b)
+{
+ return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
+ 2, 18, 3, 19,
+ 2+4, 18+4, 3+4, 19+4,
+ 2+8, 18+8, 3+8, 19+8,
+ 2+12, 18+12, 3+12, 19+12);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+ (__v16sf)_mm512_unpackhi_ps(__A, __B),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+ (__v16sf)_mm512_unpackhi_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_unpacklo_ps(__m512 __a, __m512 __b)
+{
+ return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
+ 0, 16, 1, 17,
+ 0+4, 16+4, 1+4, 17+4,
+ 0+8, 16+8, 1+8, 17+8,
+ 0+12, 16+12, 1+12, 17+12);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+ (__v16sf)_mm512_unpacklo_ps(__A, __B),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
+ (__v16sf)_mm512_unpacklo_ps(__A, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
+ 2, 18, 3, 19,
+ 2+4, 18+4, 3+4, 19+4,
+ 2+8, 18+8, 3+8, 19+8,
+ 2+12, 18+12, 3+12, 19+12);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+ (__v16si)_mm512_unpackhi_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+ (__v16si)_mm512_unpackhi_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
+ 0, 16, 1, 17,
+ 0+4, 16+4, 1+4, 17+4,
+ 0+8, 16+8, 1+8, 17+8,
+ 0+12, 16+12, 1+12, 17+12);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+ (__v16si)_mm512_unpacklo_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
+ (__v16si)_mm512_unpacklo_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
+ 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+ (__v8di)_mm512_unpackhi_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+ (__v8di)_mm512_unpackhi_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
+ 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+ (__v8di)_mm512_unpacklo_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
+ (__v8di)_mm512_unpacklo_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
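+
+/* Illustrative usage sketch (not part of the upstream header). Unpacking
+ * interleaves the low/high halves of each 128-bit lane rather than of the
+ * whole 512-bit vector; placeholder variable names.
+ *
+ *   __m512i a  = _mm512_set1_epi32(1);
+ *   __m512i b  = _mm512_set1_epi32(2);
+ *   // Per 128-bit lane: a0, b0, a1, b1 from the low halves.
+ *   __m512i lo = _mm512_unpacklo_epi32(a, b);
+ */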
+
+
+/* SIMD load ops */
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_loadu_si512 (void const *__P)
+{
+ struct __loadu_si512 {
+ __m512i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_si512*)__P)->__v;
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_loadu_epi32 (void const *__P)
+{
+ struct __loadu_epi32 {
+ __m512i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_epi32*)__P)->__v;
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_loadu_epi64 (void const *__P)
+{
+ struct __loadu_epi64 {
+ __m512i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_epi64*)__P)->__v;
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_loadu_pd(void const *__p)
+{
+ struct __loadu_pd {
+ __m512d_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_pd*)__p)->__v;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_loadu_ps(void const *__p)
+{
+ struct __loadu_ps {
+ __m512_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_ps*)__p)->__v;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_load_ps(void const *__p)
+{
+ return *(const __m512*)__p;
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_load_pd(void const *__p)
+{
+ return *(const __m512d*)__p;
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_load_si512 (void const *__P)
+{
+ return *(const __m512i *) __P;
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_load_epi32 (void const *__P)
+{
+ return *(const __m512i *) __P;
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_load_epi64 (void const *__P)
+{
+ return *(const __m512i *) __P;
+}
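+
+/* Illustrative usage sketch (not part of the upstream header). The masked
+ * unaligned loads keep (mask_loadu) or zero (maskz_loadu) the elements whose
+ * mask bit is clear; `buf` is a placeholder array.
+ *
+ *   float buf[16] = {0};
+ *   __m512 old = _mm512_set1_ps(-1.0f);
+ *   // Load only the even elements; odd elements keep the value from `old`.
+ *   __m512 v = _mm512_mask_loadu_ps(old, (__mmask16)0x5555, buf);
+ */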
+
+/* SIMD store ops */
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_storeu_epi64 (void *__P, __m512i __A)
+{
+ struct __storeu_epi64 {
+ __m512i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_epi64*)__P)->__v = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
+{
+ __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
+ (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_storeu_si512 (void *__P, __m512i __A)
+{
+ struct __storeu_si512 {
+ __m512i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_si512*)__P)->__v = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_storeu_epi32 (void *__P, __m512i __A)
+{
+ struct __storeu_epi32 {
+ __m512i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_epi32*)__P)->__v = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
+{
+ __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
+ (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
+{
+ __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_storeu_pd(void *__P, __m512d __A)
+{
+ struct __storeu_pd {
+ __m512d_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_pd*)__P)->__v = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+ __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
+ (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_storeu_ps(void *__P, __m512 __A)
+{
+ struct __storeu_ps {
+ __m512_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_ps*)__P)->__v = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
+{
+ __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_store_pd(void *__P, __m512d __A)
+{
+ *(__m512d*)__P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
+{
+ __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
+ (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_store_ps(void *__P, __m512 __A)
+{
+ *(__m512*)__P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_store_si512 (void *__P, __m512i __A)
+{
+ *(__m512i *) __P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_store_epi32 (void *__P, __m512i __A)
+{
+ *(__m512i *) __P = __A;
+}
+
+static __inline void __DEFAULT_FN_ATTRS512
+_mm512_store_epi64 (void *__P, __m512i __A)
+{
+ *(__m512i *) __P = __A;
+}
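+
+/* Illustrative usage sketch (not part of the upstream header). The aligned
+ * _mm512_store_* forms require 64-byte alignment, while the storeu forms do
+ * not; placeholder names.
+ *
+ *   _Alignas(64) double out[8];
+ *   __m512d v = _mm512_set1_pd(3.0);
+ *   _mm512_store_pd(out, v);                         // aligned store
+ *   _mm512_mask_storeu_pd(out, (__mmask8)0x0F, v);   // low four lanes only
+ */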
+
+/* Mask ops */
+
+static __inline __mmask16 __DEFAULT_FN_ATTRS
+_mm512_knot(__mmask16 __M)
+{
+ return __builtin_ia32_knothi(__M);
+}
+
+/* Integer compare */
+
+#define _mm512_cmpeq_epi32_mask(A, B) \
+ _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epi32_mask(k, A, B) \
+ _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epi32_mask(A, B) \
+ _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epi32_mask(k, A, B) \
+ _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epi32_mask(A, B) \
+ _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
+ _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epi32_mask(A, B) \
+ _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epi32_mask(k, A, B) \
+ _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epi32_mask(A, B) \
+ _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epi32_mask(k, A, B) \
+ _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epi32_mask(A, B) \
+ _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
+ _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm512_cmpeq_epu32_mask(A, B) \
+ _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epu32_mask(k, A, B) \
+ _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epu32_mask(A, B) \
+ _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epu32_mask(k, A, B) \
+ _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epu32_mask(A, B) \
+ _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
+ _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epu32_mask(A, B) \
+ _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epu32_mask(k, A, B) \
+ _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epu32_mask(A, B) \
+ _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epu32_mask(k, A, B) \
+ _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epu32_mask(A, B) \
+ _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
+ _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm512_cmpeq_epi64_mask(A, B) \
+ _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
+ _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epi64_mask(A, B) \
+ _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epi64_mask(k, A, B) \
+ _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epi64_mask(A, B) \
+ _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
+ _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epi64_mask(A, B) \
+ _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epi64_mask(k, A, B) \
+ _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epi64_mask(A, B) \
+ _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epi64_mask(k, A, B) \
+ _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epi64_mask(A, B) \
+ _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
+ _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm512_cmpeq_epu64_mask(A, B) \
+ _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
+ _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm512_cmpge_epu64_mask(A, B) \
+ _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
+#define _mm512_mask_cmpge_epu64_mask(k, A, B) \
+ _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm512_cmpgt_epu64_mask(A, B) \
+ _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
+#define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
+ _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm512_cmple_epu64_mask(A, B) \
+ _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
+#define _mm512_mask_cmple_epu64_mask(k, A, B) \
+ _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm512_cmplt_epu64_mask(A, B) \
+ _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
+#define _mm512_mask_cmplt_epu64_mask(k, A, B) \
+ _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm512_cmpneq_epu64_mask(A, B) \
+ _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
+#define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
+ _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
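+
+/* Illustrative usage sketch (not part of the upstream header). The compare
+ * wrappers produce a bitmask that later masked operations can consume;
+ * placeholder names.
+ *
+ *   __m512i x = _mm512_set1_epi32(5);
+ *   __m512i y = _mm512_set1_epi32(7);
+ *   __mmask16 k = _mm512_cmplt_epi32_mask(x, y);   // 0xFFFF here
+ *   __m512i  z = _mm512_maskz_mov_epi32(k, y);     // keep y where x < y
+ */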
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi8_epi32(__m128i __A)
+{
+ /* This function always performs a signed extension, but __v16qi is based on
+ char, whose signedness is implementation-defined, so use the explicitly
+ signed __v16qs. */
+ return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi8_epi32(__A),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi8_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi8_epi64(__m128i __A)
+{
+ /* This function always performs a signed extension, but __v16qi is based on
+ char, whose signedness is implementation-defined, so use the explicitly
+ signed __v16qs. */
+ return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi8_epi64(__A),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi8_epi64(__A),
+ (__v8di)_mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi32_epi64(__m256i __X)
+{
+ return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi32_epi64(__X),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi32_epi64(__X),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi16_epi32(__m256i __A)
+{
+ return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi16_epi32(__A),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepi16_epi32(__A),
+ (__v16si)_mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi16_epi64(__m128i __A)
+{
+ return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi16_epi64(__A),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepi16_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepu8_epi32(__m128i __A)
+{
+ return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu8_epi32(__A),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu8_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepu8_epi64(__m128i __A)
+{
+ return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu8_epi64(__A),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu8_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepu32_epi64(__m256i __X)
+{
+ return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu32_epi64(__X),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu32_epi64(__X),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepu16_epi32(__m256i __A)
+{
+ return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu16_epi32(__A),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_cvtepu16_epi32(__A),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtepu16_epi64(__m128i __A)
+{
+ return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu16_epi64(__A),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_cvtepu16_epi64(__A),
+ (__v8di)_mm512_setzero_si512());
+}
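+
+/* Illustrative usage sketch (not part of the upstream header). The cvtepi*
+ * forms sign-extend and the cvtepu* forms zero-extend the narrower source
+ * elements; placeholder names.
+ *
+ *   __m128i bytes = _mm_set1_epi8(-1);
+ *   __m512i s = _mm512_cvtepi8_epi32(bytes);   // every lane becomes -1
+ *   __m512i u = _mm512_cvtepu8_epi32(bytes);   // every lane becomes 255
+ */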
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_rorv_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_rorv_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_rorv_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_rorv_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__U,
+ (__v8di)_mm512_rorv_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__U,
+ (__v8di)_mm512_rorv_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+
+
+#define _mm512_cmp_epi32_mask(a, b, p) \
+ ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
+ (__v16si)(__m512i)(b), (int)(p), \
+ (__mmask16)-1))
+
+#define _mm512_cmp_epu32_mask(a, b, p) \
+ ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
+ (__v16si)(__m512i)(b), (int)(p), \
+ (__mmask16)-1))
+
+#define _mm512_cmp_epi64_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
+ (__v8di)(__m512i)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm512_cmp_epu64_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
+ (__v8di)(__m512i)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm512_mask_cmp_epi32_mask(m, a, b, p) \
+ ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
+ (__v16si)(__m512i)(b), (int)(p), \
+ (__mmask16)(m)))
+
+#define _mm512_mask_cmp_epu32_mask(m, a, b, p) \
+ ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
+ (__v16si)(__m512i)(b), (int)(p), \
+ (__mmask16)(m)))
+
+#define _mm512_mask_cmp_epi64_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
+ (__v8di)(__m512i)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm512_mask_cmp_epu64_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
+ (__v8di)(__m512i)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm512_rol_epi32(a, b) \
+ ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)))
+
+#define _mm512_mask_rol_epi32(W, U, a, b) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_rol_epi32((a), (b)), \
+ (__v16si)(__m512i)(W)))
+
+#define _mm512_maskz_rol_epi32(U, a, b) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_rol_epi32((a), (b)), \
+ (__v16si)_mm512_setzero_si512()))
+
+#define _mm512_rol_epi64(a, b) \
+ ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)))
+
+#define _mm512_mask_rol_epi64(W, U, a, b) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_rol_epi64((a), (b)), \
+ (__v8di)(__m512i)(W)))
+
+#define _mm512_maskz_rol_epi64(U, a, b) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_rol_epi64((a), (b)), \
+ (__v8di)_mm512_setzero_si512()))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_rolv_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_rolv_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_rolv_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_rolv_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__U,
+ (__v8di)_mm512_rolv_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__U,
+ (__v8di)_mm512_rolv_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+#define _mm512_ror_epi32(A, B) \
+ ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B)))
+
+#define _mm512_mask_ror_epi32(W, U, A, B) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_ror_epi32((A), (B)), \
+ (__v16si)(__m512i)(W)))
+
+#define _mm512_maskz_ror_epi32(U, A, B) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_ror_epi32((A), (B)), \
+ (__v16si)_mm512_setzero_si512()))
+
+#define _mm512_ror_epi64(A, B) \
+ ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B)))
+
+#define _mm512_mask_ror_epi64(W, U, A, B) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_ror_epi64((A), (B)), \
+ (__v8di)(__m512i)(W)))
+
+#define _mm512_maskz_ror_epi64(U, A, B) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_ror_epi64((A), (B)), \
+ (__v8di)_mm512_setzero_si512()))
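+
+/* Illustrative usage sketch (not part of the upstream header). rol/ror take
+ * an immediate rotate count, while rolv/rorv take a per-element count vector;
+ * placeholder names.
+ *
+ *   __m512i v   = _mm512_set1_epi32((int)0x80000001);
+ *   __m512i r1  = _mm512_rol_epi32(v, 1);          // each lane = 0x00000003
+ *   __m512i cnt = _mm512_set1_epi32(4);
+ *   __m512i r2  = _mm512_rorv_epi32(v, cnt);       // each lane = 0x18000000
+ */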
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_slli_epi32(__m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
+ unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_slli_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_slli_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_slli_epi64(__m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_slli_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_slli_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srli_epi32(__m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
+ unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srli_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srli_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srli_epi64(__m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A,
+ unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srli_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A,
+ unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srli_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
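+
+/* Illustrative usage sketch (not part of the upstream header). slli/srli take
+ * an immediate shift count, sll/srl a count in the low quadword of an
+ * __m128i, and sllv/srlv a per-element count vector; placeholder names.
+ *
+ *   __m512i v = _mm512_set1_epi32(1);
+ *   __m512i a = _mm512_slli_epi32(v, 3);                    // each lane = 8
+ *   __m512i b = _mm512_sll_epi32(v, _mm_cvtsi32_si128(3));  // same result
+ */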
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+ __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
+ (__v16si) __A,
+ (__v16si) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
+ (__v16si) __A,
+ (__v16si) _mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
+ (__v8di) __A,
+ (__v8di) __W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
+ (__v8di) __A,
+ (__v8di) _mm512_setzero_si512 ());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+ __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_movedup_pd (__m512d __A)
+{
+ return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
+ 0, 0, 2, 2, 4, 4, 6, 6);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_movedup_pd(__A),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_movedup_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+#define _mm512_fixupimm_round_pd(A, B, C, imm, R) \
+ ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8di)(__m512i)(C), (int)(imm), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
+ ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8di)(__m512i)(C), (int)(imm), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_fixupimm_pd(A, B, C, imm) \
+ ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8di)(__m512i)(C), (int)(imm), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \
+ ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8di)(__m512i)(C), (int)(imm), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
+ ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8di)(__m512i)(C), \
+ (int)(imm), (__mmask8)(U), \
+ (int)(R)))
+
+#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \
+ ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8di)(__m512i)(C), \
+ (int)(imm), (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \
+ ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16si)(__m512i)(C), (int)(imm), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
+ ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16si)(__m512i)(C), (int)(imm), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_fixupimm_ps(A, B, C, imm) \
+ ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16si)(__m512i)(C), (int)(imm), \
+ (__mmask16)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \
+ ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16si)(__m512i)(C), (int)(imm), \
+ (__mmask16)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
+ ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16si)(__m512i)(C), \
+ (int)(imm), (__mmask16)(U), \
+ (int)(R)))
+
+#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \
+ ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16si)(__m512i)(C), \
+ (int)(imm), (__mmask16)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_fixupimm_round_sd(A, B, C, imm, R) \
+ ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2di)(__m128i)(C), (int)(imm), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \
+ ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2di)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_fixupimm_sd(A, B, C, imm) \
+ ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2di)(__m128i)(C), (int)(imm), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \
+ ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2di)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \
+ ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2di)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \
+ ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2di)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_fixupimm_round_ss(A, B, C, imm, R) \
+ ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4si)(__m128i)(C), (int)(imm), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \
+ ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4si)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_fixupimm_ss(A, B, C, imm) \
+ ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4si)(__m128i)(C), (int)(imm), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \
+ ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4si)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \
+ ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4si)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \
+ ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4si)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_getexp_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_getexp_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
+ (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_getexp_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
+ (__v2df) __B,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_getexp_round_sd(U, A, B, R) \
+ ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_getexp_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_getexp_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
+ (__v4sf) __B, (__v4sf) _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_getexp_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_getexp_round_ss(U, A, B, R) \
+ ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_getmant_round_sd(A, B, C, D, R) \
+ ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (int)(((D)<<2) | (C)), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_getmant_sd(A, B, C, D) \
+ ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (int)(((D)<<2) | (C)), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_sd(W, U, A, B, C, D) \
+ ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (int)(((D)<<2) | (C)), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \
+ ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (int)(((D)<<2) | (C)), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_getmant_sd(U, A, B, C, D) \
+ ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (int)(((D)<<2) | (C)), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \
+ ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (int)(((D)<<2) | (C)), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_getmant_round_ss(A, B, C, D, R) \
+ ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (int)(((D)<<2) | (C)), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_getmant_ss(A, B, C, D) \
+ ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (int)(((D)<<2) | (C)), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_ss(W, U, A, B, C, D) \
+ ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (int)(((D)<<2) | (C)), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \
+ ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (int)(((D)<<2) | (C)), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_getmant_ss(U, A, B, C, D) \
+ ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (int)(((D)<<2) | (C)), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \
+ ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (int)(((D)<<2) | (C)), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
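+
+/* Illustrative usage sketch (not part of the upstream header). The second and
+ * third getmant arguments select the normalization interval and the sign
+ * control, packed as ((D) << 2) | (C) above; placeholder names.
+ *
+ *   __m128d v = _mm_set_sd(-12.0);
+ *   // Mantissa normalized into [1, 2) with the sign forced to positive:
+ *   __m128d m = _mm_getmant_sd(v, v, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_zero);
+ */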
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kmov (__mmask16 __A)
+{
+ return __A;
+}
+
+#define _mm_comi_round_sd(A, B, P, R) \
+ ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
+ (int)(P), (int)(R)))
+
+#define _mm_comi_round_ss(A, B, P, R) \
+ ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+ (int)(P), (int)(R)))
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_si64(A, R) \
+ ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
+#endif
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sll_epi32(__m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sll_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sll_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sll_epi64(__m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sll_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sll_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sllv_epi32(__m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sllv_epi32(__X, __Y),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sllv_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sllv_epi64(__m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sllv_epi64(__X, __Y),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sllv_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sra_epi32(__m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sra_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_sra_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_sra_epi64(__m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sra_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_sra_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srav_epi32(__m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srav_epi32(__X, __Y),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srav_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srav_epi64(__m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srav_epi64(__X, __Y),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srav_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srl_epi32(__m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srl_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srl_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srl_epi64(__m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srl_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srl_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srlv_epi32(__m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srlv_epi32(__X, __Y),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srlv_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srlv_epi64(__X, __Y),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srlv_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
+}
+
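+/* Bitwise ternary logic: each result bit is looked up in the 8-bit truth
+ * table 'imm', indexed by the corresponding bits of A, B and C as
+ * ((a << 2) | (b << 1) | c). For example, imm == 0xCA gives the bitwise
+ * select A ? B : C. The _mask form keeps the element of A where the mask bit
+ * is clear; the _maskz form zeroes it. */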
+#define _mm512_ternarylogic_epi32(A, B, C, imm) \
+ ((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B), \
+ (__v16si)(__m512i)(C), (int)(imm), \
+ (__mmask16)-1))
+
+#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \
+ ((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B), \
+ (__v16si)(__m512i)(C), (int)(imm), \
+ (__mmask16)(U)))
+
+#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \
+ ((__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B), \
+ (__v16si)(__m512i)(C), \
+ (int)(imm), (__mmask16)(U)))
+
+#define _mm512_ternarylogic_epi64(A, B, C, imm) \
+ ((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), \
+ (__v8di)(__m512i)(C), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \
+ ((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), \
+ (__v8di)(__m512i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \
+ ((__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), \
+ (__v8di)(__m512i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
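+/* Scalar conversions with explicit rounding control. The _cvt_round* macros
+ * take a rounding argument R (e.g. _MM_FROUND_TO_NEAREST_INT combined with
+ * _MM_FROUND_NO_EXC); the inline wrappers use _MM_FROUND_CUR_DIRECTION,
+ * i.e. the rounding mode currently set in MXCSR. The _cvtt* variants always
+ * truncate toward zero. */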
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_i64(A, R) \
+ ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
+#endif
+
+#define _mm_cvt_roundsd_si32(A, R) \
+ ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))
+
+#define _mm_cvt_roundsd_i32(A, R) \
+ ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))
+
+#define _mm_cvt_roundsd_u32(A, R) \
+ ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)))
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS128
+_mm_cvtsd_u32 (__m128d __A)
+{
+ return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsd_u64(A, R) \
+ ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
+ (int)(R)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
+_mm_cvtsd_u64 (__m128d __A)
+{
+ return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvt_roundss_si32(A, R) \
+ ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))
+
+#define _mm_cvt_roundss_i32(A, R) \
+ ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))
+
+#ifdef __x86_64__
+#define _mm_cvt_roundss_si64(A, R) \
+ ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
+
+#define _mm_cvt_roundss_i64(A, R) \
+ ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
+#endif
+
+#define _mm_cvt_roundss_u32(A, R) \
+ ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)))
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS128
+_mm_cvtss_u32 (__m128 __A)
+{
+ return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundss_u64(A, R) \
+ ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
+ (int)(R)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
+_mm_cvtss_u64 (__m128 __A)
+{
+ return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundsd_i32(A, R) \
+ ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))
+
+#define _mm_cvtt_roundsd_si32(A, R) \
+ ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))
+
+static __inline__ int __DEFAULT_FN_ATTRS128
+_mm_cvttsd_i32 (__m128d __A)
+{
+ return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundsd_si64(A, R) \
+ ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))
+
+#define _mm_cvtt_roundsd_i64(A, R) \
+ ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))
+
+static __inline__ long long __DEFAULT_FN_ATTRS128
+_mm_cvttsd_i64 (__m128d __A)
+{
+ return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundsd_u32(A, R) \
+ ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)))
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS128
+_mm_cvttsd_u32 (__m128d __A)
+{
+ return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundsd_u64(A, R) \
+ ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
+ (int)(R)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
+_mm_cvttsd_u64 (__m128d __A)
+{
+ return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundss_i32(A, R) \
+ ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))
+
+#define _mm_cvtt_roundss_si32(A, R) \
+ ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))
+
+static __inline__ int __DEFAULT_FN_ATTRS128
+_mm_cvttss_i32 (__m128 __A)
+{
+ return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundss_i64(A, R) \
+ ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))
+
+#define _mm_cvtt_roundss_si64(A, R) \
+ ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))
+
+static __inline__ long long __DEFAULT_FN_ATTRS128
+_mm_cvttss_i64 (__m128 __A)
+{
+ return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundss_u32(A, R) \
+ ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)))
+
+static __inline__ unsigned __DEFAULT_FN_ATTRS128
+_mm_cvttss_u32 (__m128 __A)
+{
+ return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundss_u64(A, R) \
+ ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
+ (int)(R)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
+_mm_cvttss_u64 (__m128 __A)
+{
+ return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
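+/* Element permutations. _mm512_permute_{pd,ps} shuffle elements within each
+ * 128-bit lane according to an immediate; the 'permutevar' forms take the
+ * control from a vector instead. 'permutex2var' selects elements from the
+ * concatenation of two vectors using the index vector __I; its 'mask2' form
+ * copies __I (reinterpreted) into elements whose mask bit is clear. */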
+#define _mm512_permute_pd(X, C) \
+ ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)))
+
+#define _mm512_mask_permute_pd(W, U, X, C) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_permute_pd((X), (C)), \
+ (__v8df)(__m512d)(W)))
+
+#define _mm512_maskz_permute_pd(U, X, C) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_permute_pd((X), (C)), \
+ (__v8df)_mm512_setzero_pd()))
+
+#define _mm512_permute_ps(X, C) \
+ ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)))
+
+#define _mm512_mask_permute_ps(W, U, X, C) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_permute_ps((X), (C)), \
+ (__v16sf)(__m512)(W)))
+
+#define _mm512_maskz_permute_ps(U, X, C) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_permute_ps((X), (C)), \
+ (__v16sf)_mm512_setzero_ps()))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_permutevar_pd(__m512d __A, __m512i __C)
+{
+ return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_permutevar_pd(__A, __C),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_permutevar_pd(__A, __C),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_permutevar_ps(__m512 __A, __m512i __C)
+{
+ return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_permutevar_ps(__A, __C),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_permutevar_ps(__A, __C),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline __m512d __DEFAULT_FN_ATTRS512
+_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
+{
+ return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
+ (__v8df)__B);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
+{
+ return (__m512d)__builtin_ia32_selectpd_512(__U,
+ (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
+ (__v8df)__A);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
+ __m512d __B)
+{
+ return (__m512d)__builtin_ia32_selectpd_512(__U,
+ (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
+ (__v8df)(__m512d)__I);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
+ __m512d __B)
+{
+ return (__m512d)__builtin_ia32_selectpd_512(__U,
+ (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline __m512 __DEFAULT_FN_ATTRS512
+_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
+{
+ return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
+ (__v16sf) __B);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
+{
+ return (__m512)__builtin_ia32_selectps_512(__U,
+ (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
+ (__v16sf)__A);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
+{
+ return (__m512)__builtin_ia32_selectps_512(__U,
+ (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
+ (__v16sf)(__m512)__I);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
+{
+ return (__m512)__builtin_ia32_selectps_512(__U,
+ (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+
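+/* Truncating conversion of eight doubles to unsigned 32-bit integers, with
+ * merge-masking and zero-masking variants. Because truncation fixes the
+ * rounding direction, the R argument of the _round macros is effectively
+ * limited to exception-suppression control (_MM_FROUND_NO_EXC) or
+ * _MM_FROUND_CUR_DIRECTION. */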
+#define _mm512_cvtt_roundpd_epu32(A, R) \
+ ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
+ (__v8si)_mm256_undefined_si256(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \
+ ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
+ (__v8si)(__m256i)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \
+ ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
+ (__v8si)_mm256_setzero_si256(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvttpd_epu32 (__m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+ (__v8si) _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+ (__v8si) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+ (__v8si) _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
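+/* Scalar ROUNDSCALE: round the low element of B to the number of fraction
+ * bits encoded in imm[7:4], with the rounding behaviour selected by the low
+ * nibble of imm, and copy the remaining element(s) from A. */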
+#define _mm_roundscale_round_sd(A, B, imm, R) \
+ ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(imm), \
+ (int)(R)))
+
+#define _mm_roundscale_sd(A, B, imm) \
+ ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(imm), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_roundscale_sd(W, U, A, B, imm) \
+ ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(imm), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \
+ ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(I), \
+ (int)(R)))
+
+#define _mm_maskz_roundscale_sd(U, A, B, I) \
+ ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(I), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
+ ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(I), \
+ (int)(R)))
+
+#define _mm_roundscale_round_ss(A, B, imm, R) \
+ ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(imm), \
+ (int)(R)))
+
+#define _mm_roundscale_ss(A, B, imm) \
+ ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(imm), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_roundscale_ss(W, U, A, B, I) \
+ ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U), (int)(I), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \
+ ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U), (int)(I), \
+ (int)(R)))
+
+#define _mm_maskz_roundscale_ss(U, A, B, I) \
+ ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(I), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
+ ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(I), \
+ (int)(R)))
+
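+/* SCALEF computes A * 2^floor(B) per element (for the low element only in
+ * the scalar _sd/_ss forms), i.e. a vectorised ldexp()/scalbn() whose
+ * exponent argument is itself a floating-point vector. */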
+#define _mm512_scalef_round_pd(A, B, R) \
+ ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)_mm512_undefined_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \
+ ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_scalef_round_pd(U, A, B, R) \
+ ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_scalef_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) _mm512_undefined_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_scalef_round_ps(A, B, R) \
+ ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)_mm512_undefined_ps(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \
+ ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)(__m512)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_scalef_round_ps(U, A, B, R) \
+ ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_scalef_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) _mm512_undefined_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_scalef_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_scalef_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_scalef_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
+ (__v2df) __B,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_scalef_round_sd(U, A, B, R) \
+ ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_scalef_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_scalef_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_scalef_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_scalef_round_ss(U, A, B, R) \
+ ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), \
+ (int)(R)))
+
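+/* Arithmetic right shifts by an immediate count; counts at or above the
+ * element width fill the result with copies of the sign bit. */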
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srai_epi32(__m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
+ unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srai_epi32(__A, __B),
+ (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
+ (__v16si)_mm512_srai_epi32(__A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_srai_epi64(__m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srai_epi64(__A, __B),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
+ (__v8di)_mm512_srai_epi64(__A, __B),
+ (__v8di)_mm512_setzero_si512());
+}
+
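+/* 128-bit "lane" shuffles: each 2-bit field of imm selects one 128-bit block,
+ * with the lower half of the result drawn from A and the upper half from B.
+ * The shuffle_pd/shuffle_ps macros further below are the usual per-element
+ * shuffles operating within each 128-bit lane. */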
+#define _mm512_shuffle_f32x4(A, B, imm) \
+ ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(imm)))
+
+#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
+ (__v16sf)(__m512)(W)))
+
+#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
+ (__v16sf)_mm512_setzero_ps()))
+
+#define _mm512_shuffle_f64x2(A, B, imm) \
+ ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(imm)))
+
+#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
+ (__v8df)(__m512d)(W)))
+
+#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
+ (__v8df)_mm512_setzero_pd()))
+
+#define _mm512_shuffle_i32x4(A, B, imm) \
+ ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B), (int)(imm)))
+
+#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
+ (__v16si)(__m512i)(W)))
+
+#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
+ (__v16si)_mm512_setzero_si512()))
+
+#define _mm512_shuffle_i64x2(A, B, imm) \
+ ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), (int)(imm)))
+
+#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
+ (__v8di)(__m512i)(W)))
+
+#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
+ (__v8di)_mm512_setzero_si512()))
+
+#define _mm512_shuffle_pd(A, B, M) \
+ ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(M)))
+
+#define _mm512_mask_shuffle_pd(W, U, A, B, M) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
+ (__v8df)(__m512d)(W)))
+
+#define _mm512_maskz_shuffle_pd(U, A, B, M) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
+ (__v8df)_mm512_setzero_pd()))
+
+#define _mm512_shuffle_ps(A, B, M) \
+ ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(M)))
+
+#define _mm512_mask_shuffle_ps(W, U, A, B, M) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
+ (__v16sf)(__m512)(W)))
+
+#define _mm512_maskz_shuffle_ps(U, A, B, M) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
+ (__v16sf)_mm512_setzero_ps()))
+
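+/* Masked scalar square roots: compute sqrt of the low element of B, merge
+ * with W (or zero) according to the mask, and copy the upper element(s)
+ * from A. */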
+#define _mm_sqrt_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
+ (__v2df) __B,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_sqrt_round_sd(U, A, B, R) \
+ ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_sqrt_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_sqrt_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(W), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_sqrt_round_ss(U, A, B, R) \
+ ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
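+/* Sub-vector broadcasts: replicate a 128-bit block (f32x4/i32x4) or a
+ * 256-bit block (f64x4/i64x4) across the whole 512-bit destination. The
+ * unmasked forms are written with __builtin_shufflevector so the compiler
+ * is free to lower them to whatever broadcast/permute sequence it prefers. */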
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_broadcast_f32x4(__m128 __A)
+{
+ return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
+ 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+ (__v16sf)_mm512_broadcast_f32x4(__A),
+ (__v16sf)__O);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
+ (__v16sf)_mm512_broadcast_f32x4(__A),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_broadcast_f64x4(__m256d __A)
+{
+ return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
+ 0, 1, 2, 3, 0, 1, 2, 3);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+ (__v8df)_mm512_broadcast_f64x4(__A),
+ (__v8df)__O);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
+ (__v8df)_mm512_broadcast_f64x4(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_broadcast_i32x4(__m128i __A)
+{
+ return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+ 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_broadcast_i32x4(__A),
+ (__v16si)__O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_broadcast_i32x4(__A),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_broadcast_i64x4(__m256i __A)
+{
+ return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
+ 0, 1, 2, 3, 0, 1, 2, 3);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_broadcast_i64x4(__A),
+ (__v8di)__O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_broadcast_i64x4(__A),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512(__M,
+ (__v8df) _mm512_broadcastsd_pd(__A),
+ (__v8df) __O);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512(__M,
+ (__v8df) _mm512_broadcastsd_pd(__A),
+ (__v8df) _mm512_setzero_pd());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512(__M,
+ (__v16sf) _mm512_broadcastss_ps(__A),
+ (__v16sf) __O);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512(__M,
+ (__v16sf) _mm512_broadcastss_ps(__A),
+ (__v16sf) _mm512_setzero_ps());
+}
+
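+/* Narrowing integer conversions ("down-converts"): cvtsepi* saturate as
+ * signed, cvtusepi* saturate as unsigned, and the plain cvtepi* forms
+ * truncate. The *_storeu_* variants write the narrowed elements directly to
+ * unaligned memory under the mask instead of returning a vector. */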
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_cvtsepi32_epi8 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+ (__v16qi) _mm_undefined_si128 (),
+ (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+ (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+ __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtsepi32_epi16 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+ (__v16hi) _mm256_undefined_si256 (),
+ (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+ (__v16hi) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+ (__v16hi) _mm256_setzero_si256 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+ __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_cvtsepi64_epi8 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+ (__v16qi) _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+ (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtsepi64_epi32 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+ (__v8si) _mm256_undefined_si256 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+ (__v8si) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+ (__v8si) _mm256_setzero_si256 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_cvtsepi64_epi16 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+ (__v8hi) _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_cvtusepi32_epi8 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+ (__v16qi) _mm_undefined_si128 (),
+ (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+ __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtusepi32_epi16 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+ (__v16hi) _mm256_undefined_si256 (),
+ (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+ (__v16hi) __O,
+ __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+ (__v16hi) _mm256_setzero_si256 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+ __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_cvtusepi64_epi8 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+ (__v16qi) _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtusepi64_epi32 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+ (__v8si) _mm256_undefined_si256 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+ (__v8si) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+ (__v8si) _mm256_setzero_si256 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_cvtusepi64_epi16 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+ (__v8hi) _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi32_epi8 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+ (__v16qi) _mm_undefined_si128 (),
+ (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+ (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+ __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi32_epi16 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+ (__v16hi) _mm256_undefined_si256 (),
+ (__mmask16) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+ (__v16hi) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+ (__v16hi) _mm256_setzero_si256 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
+{
+ __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi64_epi8 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+ (__v16qi) _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+ (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi64_epi32 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+ (__v8si) _mm256_undefined_si256 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+ (__v8si) __O, __M);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+ (__v8si) _mm256_setzero_si256 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_cvtepi64_epi16 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+ (__v8hi) _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
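+/* Extraction and insertion of 128-/256-bit blocks: imm selects which block
+ * of the 512-bit vector is read (extracti32x4/extracti64x4) or replaced
+ * (insertf32x4/insertf64x4 and the integer equivalents). */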
+#define _mm512_extracti32x4_epi32(A, imm) \
+ ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
+ (__v4si)_mm_undefined_si128(), \
+ (__mmask8)-1))
+
+#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
+ ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
+ (__v4si)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
+ ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
+ (__v4si)_mm_setzero_si128(), \
+ (__mmask8)(U)))
+
+#define _mm512_extracti64x4_epi64(A, imm) \
+ ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
+ (__v4di)_mm256_undefined_si256(), \
+ (__mmask8)-1))
+
+#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
+ ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
+ (__v4di)(__m256i)(W), \
+ (__mmask8)(U)))
+
+#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
+ ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
+ (__v4di)_mm256_setzero_si256(), \
+ (__mmask8)(U)))
+
+#define _mm512_insertf64x4(A, B, imm) \
+ ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \
+ (__v4df)(__m256d)(B), (int)(imm)))
+
+#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+ (__v8df)(__m512d)(W)))
+
+#define _mm512_maskz_insertf64x4(U, A, B, imm) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
+ (__v8df)_mm512_setzero_pd()))
+
+#define _mm512_inserti64x4(A, B, imm) \
+ ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \
+ (__v4di)(__m256i)(B), (int)(imm)))
+
+#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+ (__v8di)(__m512i)(W)))
+
+#define _mm512_maskz_inserti64x4(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
+ (__v8di)_mm512_setzero_si512()))
+
+#define _mm512_insertf32x4(A, B, imm) \
+ ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \
+ (__v4sf)(__m128)(B), (int)(imm)))
+
+#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+ (__v16sf)(__m512)(W)))
+
+#define _mm512_maskz_insertf32x4(U, A, B, imm) \
+ ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
+ (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
+ (__v16sf)_mm512_setzero_ps()))
+
+#define _mm512_inserti32x4(A, B, imm) \
+ ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \
+ (__v4si)(__m128i)(B), (int)(imm)))
+
+#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+ (__v16si)(__m512i)(W)))
+
+#define _mm512_maskz_inserti32x4(U, A, B, imm) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
+ (__v16si)_mm512_setzero_si512()))
+
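+/* GETMANT extracts the normalised mantissa of each element. B and C are the
+ * _MM_MANT_NORM_* interval and _MM_MANT_SIGN_* sign-control enumerators,
+ * packed into the immediate as ((C << 2) | B). */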
+#define _mm512_getmant_round_pd(A, B, C, R) \
+ ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v8df)_mm512_undefined_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \
+ ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_getmant_pd(A, B, C) \
+ ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_pd(W, U, A, B, C) \
+ ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getmant_pd(U, A, B, C) \
+ ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_getmant_round_ps(A, B, C, R) \
+ ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v16sf)_mm512_undefined_ps(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \
+ ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v16sf)(__m512)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \
+ ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_getmant_ps(A, B, C) \
+ ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+ (int)(((C)<<2)|(B)), \
+ (__v16sf)_mm512_undefined_ps(), \
+ (__mmask16)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_ps(W, U, A, B, C) \
+ ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+ (int)(((C)<<2)|(B)), \
+ (__v16sf)(__m512)(W), \
+ (__mmask16)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getmant_ps(U, A, B, C) \
+ ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
+ (int)(((C)<<2)|(B)), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
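+/* GETEXP returns, as a floating-point value, the unbiased exponent of each
+ * element, i.e. floor(log2(|x|)). */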
+#define _mm512_getexp_round_pd(A, R) \
+ ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_undefined_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_getexp_round_pd(W, U, A, R) \
+ ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_getexp_round_pd(U, A, R) \
+ ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_getexp_pd (__m512d __A)
+{
+ return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+ (__v8df) _mm512_undefined_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+ (__v8df) _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_getexp_round_ps(A, R) \
+ ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_undefined_ps(), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_getexp_round_ps(W, U, A, R) \
+ ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_getexp_round_ps(U, A, R) \
+ ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_getexp_ps (__m512 __A)
+{
+ return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+ (__v16sf) _mm512_undefined_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+ (__v16sf) _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
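+/* Gathers: element i is loaded from (char const *)addr + index[i] * scale,
+ * where scale must be 1, 2, 4 or 8. The masked forms load only the elements
+ * whose mask bit is set and keep the corresponding element of v1_old
+ * otherwise. */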
+#define _mm512_i64gather_ps(index, addr, scale) \
+ ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
+ (void const *)(addr), \
+ (__v8di)(__m512i)(index), (__mmask8)-1, \
+ (int)(scale)))
+
+#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \
+ ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\
+ (void const *)(addr), \
+ (__v8di)(__m512i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm512_i64gather_epi32(index, addr, scale) \
+ ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \
+ (void const *)(addr), \
+ (__v8di)(__m512i)(index), \
+ (__mmask8)-1, (int)(scale)))
+
+#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \
+ ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
+ (void const *)(addr), \
+ (__v8di)(__m512i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm512_i64gather_pd(index, addr, scale) \
+ ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
+ (void const *)(addr), \
+ (__v8di)(__m512i)(index), (__mmask8)-1, \
+ (int)(scale)))
+
+#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \
+ ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
+ (void const *)(addr), \
+ (__v8di)(__m512i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm512_i64gather_epi64(index, addr, scale) \
+ ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \
+ (void const *)(addr), \
+ (__v8di)(__m512i)(index), (__mmask8)-1, \
+ (int)(scale)))
+
+#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \
+ ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
+ (void const *)(addr), \
+ (__v8di)(__m512i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm512_i32gather_ps(index, addr, scale) \
+ ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
+ (void const *)(addr), \
+ (__v16si)(__m512i)(index), \
+ (__mmask16)-1, (int)(scale)))
+
+#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \
+ ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
+ (void const *)(addr), \
+ (__v16si)(__m512i)(index), \
+ (__mmask16)(mask), (int)(scale)))
+
+#define _mm512_i32gather_epi32(index, addr, scale) \
+ ((__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
+ (void const *)(addr), \
+ (__v16si)(__m512i)(index), \
+ (__mmask16)-1, (int)(scale)))
+
+#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \
+ ((__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
+ (void const *)(addr), \
+ (__v16si)(__m512i)(index), \
+ (__mmask16)(mask), (int)(scale)))
+
+#define _mm512_i32gather_pd(index, addr, scale) \
+ ((__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
+ (void const *)(addr), \
+ (__v8si)(__m256i)(index), (__mmask8)-1, \
+ (int)(scale)))
+
+#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \
+ ((__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
+ (void const *)(addr), \
+ (__v8si)(__m256i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm512_i32gather_epi64(index, addr, scale) \
+ ((__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
+ (void const *)(addr), \
+ (__v8si)(__m256i)(index), (__mmask8)-1, \
+ (int)(scale)))
+
+#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \
+ ((__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
+ (void const *)(addr), \
+ (__v8si)(__m256i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
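+/* Scatters: the mirror image of the gathers above, storing element i of v1
+ * to (char *)addr + index[i] * scale; masked forms store only the selected
+ * elements. */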
+#define _mm512_i64scatter_ps(addr, index, v1, scale) \
+ __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \
+ (__v8di)(__m512i)(index), \
+ (__v8sf)(__m256)(v1), (int)(scale))
+
+#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \
+ __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \
+ (__v8di)(__m512i)(index), \
+ (__v8sf)(__m256)(v1), (int)(scale))
+
+#define _mm512_i64scatter_epi32(addr, index, v1, scale) \
+ __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \
+ (__v8di)(__m512i)(index), \
+ (__v8si)(__m256i)(v1), (int)(scale))
+
+#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
+ __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \
+ (__v8di)(__m512i)(index), \
+ (__v8si)(__m256i)(v1), (int)(scale))
+
+#define _mm512_i64scatter_pd(addr, index, v1, scale) \
+ __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \
+ (__v8di)(__m512i)(index), \
+ (__v8df)(__m512d)(v1), (int)(scale))
+
+#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \
+ __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \
+ (__v8di)(__m512i)(index), \
+ (__v8df)(__m512d)(v1), (int)(scale))
+
+#define _mm512_i64scatter_epi64(addr, index, v1, scale) \
+ __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \
+ (__v8di)(__m512i)(index), \
+ (__v8di)(__m512i)(v1), (int)(scale))
+
+#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
+ __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \
+ (__v8di)(__m512i)(index), \
+ (__v8di)(__m512i)(v1), (int)(scale))
+
+#define _mm512_i32scatter_ps(addr, index, v1, scale) \
+ __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \
+ (__v16si)(__m512i)(index), \
+ (__v16sf)(__m512)(v1), (int)(scale))
+
+#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \
+ __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \
+ (__v16si)(__m512i)(index), \
+ (__v16sf)(__m512)(v1), (int)(scale))
+
+#define _mm512_i32scatter_epi32(addr, index, v1, scale) \
+ __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \
+ (__v16si)(__m512i)(index), \
+ (__v16si)(__m512i)(v1), (int)(scale))
+
+#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
+ __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \
+ (__v16si)(__m512i)(index), \
+ (__v16si)(__m512i)(v1), (int)(scale))
+
+#define _mm512_i32scatter_pd(addr, index, v1, scale) \
+ __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \
+ (__v8si)(__m256i)(index), \
+ (__v8df)(__m512d)(v1), (int)(scale))
+
+#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \
+ __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \
+ (__v8si)(__m256i)(index), \
+ (__v8df)(__m512d)(v1), (int)(scale))
+
+#define _mm512_i32scatter_epi64(addr, index, v1, scale) \
+ __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \
+ (__v8si)(__m256i)(index), \
+ (__v8di)(__m512i)(v1), (int)(scale))
+
+#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
+ __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \
+ (__v8si)(__m256i)(index), \
+ (__v8di)(__m512i)(v1), (int)(scale))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
+ (__v4sf)__A,
+ (__v4sf)__B,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fmadd_round_ss(A, B, C, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(C), (__mmask8)-1, \
+ (int)(R)))
+
+#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ (__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
+ (__v4sf)__B,
+ (__v4sf)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(C), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
+ (__v4sf)__X,
+ (__v4sf)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
+ (__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
+ (__v4sf)__A,
+ -(__v4sf)__B,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fmsub_round_ss(A, B, C, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ -(__v4sf)(__m128)(C), (__mmask8)-1, \
+ (int)(R)))
+
+#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ (__v4sf)(__m128)(A), \
+ -(__v4sf)(__m128)(B), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
+ (__v4sf)__B,
+ -(__v4sf)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ -(__v4sf)(__m128)(C), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
+ (__v4sf)__X,
+ (__v4sf)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \
+ ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
+ (__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
+ -(__v4sf)__A,
+ (__v4sf)__B,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fnmadd_round_ss(A, B, C, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
+ -(__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(C), (__mmask8)-1, \
+ (int)(R)))
+
+#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ -(__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
+ -(__v4sf)__B,
+ (__v4sf)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+ -(__v4sf)(__m128)(B), \
+ (__v4sf)(__m128)(C), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
+ -(__v4sf)__X,
+ (__v4sf)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
+ -(__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
+ -(__v4sf)__A,
+ -(__v4sf)__B,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fnmsub_round_ss(A, B, C, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
+ -(__v4sf)(__m128)(B), \
+ -(__v4sf)(__m128)(C), (__mmask8)-1, \
+ (int)(R)))
+
+#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
+ -(__v4sf)(__m128)(A), \
+ -(__v4sf)(__m128)(B), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
+ -(__v4sf)__B,
+ -(__v4sf)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
+ ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
+ -(__v4sf)(__m128)(B), \
+ -(__v4sf)(__m128)(C), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
+{
+ return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
+ -(__v4sf)__X,
+ (__v4sf)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \
+ ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
+ -(__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
+ (__v2df)__A,
+ (__v2df)__B,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fmadd_round_sd(A, B, C, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(C), (__mmask8)-1, \
+ (int)(R)))
+
+#define _mm_mask_fmadd_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ (__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
+ (__v2df)__B,
+ (__v2df)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(C), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
+ (__v2df)__X,
+ (__v2df)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
+ (__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
+ (__v2df)__A,
+ -(__v2df)__B,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fmsub_round_sd(A, B, C, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ -(__v2df)(__m128d)(C), (__mmask8)-1, \
+ (int)(R)))
+
+#define _mm_mask_fmsub_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ (__v2df)(__m128d)(A), \
+ -(__v2df)(__m128d)(B), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
+ (__v2df)__B,
+ -(__v2df)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ -(__v2df)(__m128d)(C), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
+ (__v2df)__X,
+ (__v2df)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \
+ ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
+ (__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
+ -(__v2df)__A,
+ (__v2df)__B,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fnmadd_round_sd(A, B, C, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
+ -(__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(C), (__mmask8)-1, \
+ (int)(R)))
+
+#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ -(__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
+ -(__v2df)__B,
+ (__v2df)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+ -(__v2df)(__m128d)(B), \
+ (__v2df)(__m128d)(C), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
+ -(__v2df)__X,
+ (__v2df)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
+ -(__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
+ -(__v2df)__A,
+ -(__v2df)__B,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fnmsub_round_sd(A, B, C, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
+ -(__v2df)(__m128d)(B), \
+ -(__v2df)(__m128d)(C), (__mmask8)-1, \
+ (int)(R)))
+
+#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
+ -(__v2df)(__m128d)(A), \
+ -(__v2df)(__m128d)(B), (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
+ -(__v2df)__B,
+ -(__v2df)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \
+ ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
+ -(__v2df)(__m128d)(B), \
+ -(__v2df)(__m128d)(C), \
+ (__mmask8)(U), \
+ (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
+{
+ return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
+ -(__v2df)__X,
+ (__v2df)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \
+ ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
+ -(__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_permutex_pd(X, C) \
+ ((__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C)))
+
+#define _mm512_mask_permutex_pd(W, U, X, C) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_permutex_pd((X), (C)), \
+ (__v8df)(__m512d)(W)))
+
+#define _mm512_maskz_permutex_pd(U, X, C) \
+ ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
+ (__v8df)_mm512_permutex_pd((X), (C)), \
+ (__v8df)_mm512_setzero_pd()))
+
+#define _mm512_permutex_epi64(X, C) \
+ ((__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C)))
+
+#define _mm512_mask_permutex_epi64(W, U, X, C) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_permutex_epi64((X), (C)), \
+ (__v8di)(__m512i)(W)))
+
+#define _mm512_maskz_permutex_epi64(U, X, C) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_permutex_epi64((X), (C)), \
+ (__v8di)_mm512_setzero_si512()))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
+{
+ return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_permutexvar_pd(__X, __Y),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_permutexvar_pd(__X, __Y),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_permutexvar_epi64(__X, __Y),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
+ __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
+ (__v8di)_mm512_permutexvar_epi64(__X, __Y),
+ (__v8di)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
+{
+ return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_permutexvar_ps(__X, __Y),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_permutexvar_ps(__X, __Y),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X);
+}
+
+#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_permutexvar_epi32(__X, __Y),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
+ __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
+ (__v16si)_mm512_permutexvar_epi32(__X, __Y),
+ (__v16si)__W);
+}
+
+#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kand (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kandn (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kor (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_kortestc (__mmask16 __A, __mmask16 __B)
+{
+ return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm512_kortestz (__mmask16 __A, __mmask16 __B)
+{
+ return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B)
+{
+ return (unsigned char)__builtin_ia32_kortestchi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B)
+{
+ return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
+ *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B);
+ return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kxnor (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_mm512_kxor (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+#define _kand_mask16 _mm512_kand
+#define _kandn_mask16 _mm512_kandn
+#define _knot_mask16 _mm512_knot
+#define _kor_mask16 _mm512_kor
+#define _kxnor_mask16 _mm512_kxnor
+#define _kxor_mask16 _mm512_kxor
+
+#define _kshiftli_mask16(A, I) \
+ ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I)))
+
+#define _kshiftri_mask16(A, I) \
+ ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I)))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_cvtmask16_u32(__mmask16 __A) {
+ return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_cvtu32_mask16(unsigned int __A) {
+ return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS
+_load_mask16(__mmask16 *__A) {
+ return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_store_mask16(__mmask16 *__A, __mmask16 __B) {
+ *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_stream_si512 (void * __P, __m512i __A)
+{
+ typedef __v8di __v8di_aligned __attribute__((aligned(64)));
+ __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_stream_load_si512 (void const *__P)
+{
+ typedef __v8di __v8di_aligned __attribute__((aligned(64)));
+ return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_stream_pd (void *__P, __m512d __A)
+{
+ typedef __v8df __v8df_aligned __attribute__((aligned(64)));
+ __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_stream_ps (void *__P, __m512 __A)
+{
+ typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
+ __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+#define _mm_cmp_round_ss_mask(X, Y, P, R) \
+ ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
+ ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm_cmp_ss_mask(X, Y, P) \
+ ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_cmp_ss_mask(M, X, Y, P) \
+ ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P), \
+ (__mmask8)(M), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_cmp_round_sd_mask(X, Y, P, R) \
+ ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
+ ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm_cmp_sd_mask(X, Y, P) \
+ ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_cmp_sd_mask(M, X, Y, P) \
+ ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P), \
+ (__mmask8)(M), \
+ _MM_FROUND_CUR_DIRECTION))
+
+/* Bit Test */
+
+static __inline __mmask16 __DEFAULT_FN_ATTRS512
+_mm512_test_epi32_mask (__m512i __A, __m512i __B)
+{
+ return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
+_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline __mmask8 __DEFAULT_FN_ATTRS512
+_mm512_test_epi64_mask (__m512i __A, __m512i __B)
+{
+ return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
+_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
+_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
+{
+ return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
+_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
+_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
+{
+ return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
+_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
+ _mm512_setzero_si512());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_movehdup_ps (__m512 __A)
+{
+ return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
+ 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_movehdup_ps(__A),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_movehdup_ps(__A),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_moveldup_ps (__m512 __A)
+{
+ return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
+ 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_moveldup_ps(__A),
+ (__v16sf)__W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_moveldup_ps(__A),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
+ _mm_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
+ _mm_setzero_pd());
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
+{
+ __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
+{
+ __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
+{
+ __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
+ (__v4sf)_mm_setzero_ps(),
+ 0, 4, 4, 4);
+
+ return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_load_ss (__mmask8 __U, const float* __A)
+{
+ return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A,
+ (__v4sf) _mm_setzero_ps(),
+ __U & 1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
+{
+ __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
+ (__v2df)_mm_setzero_pd(),
+ 0, 2);
+
+ return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_load_sd (__mmask8 __U, const double* __A)
+{
+ return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A,
+ (__v2df) _mm_setzero_pd(),
+ __U & 1);
+}
+
+#define _mm512_shuffle_epi32(A, I) \
+ ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)))
+
+#define _mm512_mask_shuffle_epi32(W, U, A, I) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_shuffle_epi32((A), (I)), \
+ (__v16si)(__m512i)(W)))
+
+#define _mm512_maskz_shuffle_epi32(U, A, I) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_shuffle_epi32((A), (I)), \
+ (__v16si)_mm512_setzero_si512()))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+ (__v8df) _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+ (__v8di) _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
+ (__v8df) _mm512_setzero_pd(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
+ (__v8di) _mm512_setzero_si512(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
+ (__v16sf) _mm512_setzero_ps(),
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
+ (__v16si) _mm512_setzero_si512(),
+ (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+ (__v16sf) _mm512_setzero_ps(),
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+ (__v16si) _mm512_setzero_si512(),
+ (__mmask16) __U);
+}
+
+#define _mm512_cvt_roundps_pd(A, R) \
+ ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
+ (__v8df)_mm512_undefined_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm512_mask_cvt_roundps_pd(W, U, A, R) \
+ ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundps_pd(U, A, R) \
+ ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_cvtps_pd (__m256 __A)
+{
+ return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_cvtps_pd(__A),
+ (__v8df)__W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
+{
+ return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
+ (__v8df)_mm512_cvtps_pd(__A),
+ (__v8df)_mm512_setzero_pd());
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_cvtpslo_pd (__m512 __A)
+{
+ return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
+{
+ return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
+ (__v8df) __A,
+ (__v8df) __W);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
+ (__v8df) __A,
+ (__v8df) _mm512_setzero_pd ());
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
+ (__v16sf) __A,
+ (__v16sf) __W);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
+ (__v16sf) __A,
+ (__v16sf) _mm512_setzero_ps ());
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+ __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+ __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+ __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
+ (__mmask16) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512
+_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+ __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
+ (__mmask16) __U);
+}
+
+#define _mm_cvt_roundsd_ss(A, B, R) \
+ ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v4sf)_mm_undefined_ps(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \
+ ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
+{
+ return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
+ (__v2df)__B,
+ (__v4sf)__W,
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
+{
+ return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
+ (__v2df)__B,
+ (__v4sf)_mm_setzero_ps(),
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvtss_i32 _mm_cvtss_si32
+#define _mm_cvtsd_i32 _mm_cvtsd_si32
+#define _mm_cvti32_sd _mm_cvtsi32_sd
+#define _mm_cvti32_ss _mm_cvtsi32_ss
+#ifdef __x86_64__
+#define _mm_cvtss_i64 _mm_cvtss_si64
+#define _mm_cvtsd_i64 _mm_cvtsd_si64
+#define _mm_cvti64_sd _mm_cvtsi64_sd
+#define _mm_cvti64_ss _mm_cvtsi64_ss
+#endif
+
+#ifdef __x86_64__
+#define _mm_cvt_roundi64_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
+ (int)(R)))
+
+#define _mm_cvt_roundsi64_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
+ (int)(R)))
+#endif
+
+#define _mm_cvt_roundsi32_ss(A, B, R) \
+ ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))
+
+#define _mm_cvt_roundi32_ss(A, B, R) \
+ ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsi64_ss(A, B, R) \
+ ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
+ (int)(R)))
+
+#define _mm_cvt_roundi64_ss(A, B, R) \
+ ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
+ (int)(R)))
+#endif
+
+#define _mm_cvt_roundss_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v2df)_mm_undefined_pd(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_cvt_roundss_sd(U, A, B, R) \
+ ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
+{
+ return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
+ (__v4sf)__B,
+ (__v2df)__W,
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
+{
+ return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
+ (__v4sf)__B,
+ (__v2df)_mm_setzero_pd(),
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_cvtu32_sd (__m128d __A, unsigned __B)
+{
+ __A[0] = __B;
+ return __A;
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundu64_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
+ (unsigned long long)(B), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
+{
+ __A[0] = __B;
+ return __A;
+}
+#endif
+
+#define _mm_cvt_roundu32_ss(A, B, R) \
+ ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
+ (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_cvtu32_ss (__m128 __A, unsigned __B)
+{
+ __A[0] = __B;
+ return __A;
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundu64_ss(A, B, R) \
+ ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
+ (unsigned long long)(B), (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
+{
+ __A[0] = __B;
+ return __A;
+}
+#endif
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
+{
+ return (__m512i) __builtin_ia32_selectd_512(__M,
+ (__v16si) _mm512_set1_epi32(__A),
+ (__v16si) __O);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
+{
+ return (__m512i) __builtin_ia32_selectq_512(__M,
+ (__v8di) _mm512_set1_epi64(__A),
+ (__v8di) __O);
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
+ char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
+ char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
+ char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
+ char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
+ char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
+ char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
+ char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
+ char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
+ char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
+ char __e4, char __e3, char __e2, char __e1, char __e0) {
+
+ return __extension__ (__m512i)(__v64qi)
+ {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
+ __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
+ __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
+ __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
+ __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
+ __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
+ __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
+ __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
+ short __e27, short __e26, short __e25, short __e24, short __e23,
+ short __e22, short __e21, short __e20, short __e19, short __e18,
+ short __e17, short __e16, short __e15, short __e14, short __e13,
+ short __e12, short __e11, short __e10, short __e9, short __e8,
+ short __e7, short __e6, short __e5, short __e4, short __e3,
+ short __e2, short __e1, short __e0) {
+ return __extension__ (__m512i)(__v32hi)
+ {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
+ __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
+ __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
+ __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
+}
+
+static __inline __m512i __DEFAULT_FN_ATTRS512
+_mm512_set_epi32 (int __A, int __B, int __C, int __D,
+ int __E, int __F, int __G, int __H,
+ int __I, int __J, int __K, int __L,
+ int __M, int __N, int __O, int __P)
+{
+ return __extension__ (__m512i)(__v16si)
+ { __P, __O, __N, __M, __L, __K, __J, __I,
+ __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \
+ e8,e9,e10,e11,e12,e13,e14,e15) \
+ _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
+ (e5),(e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_set_epi64 (long long __A, long long __B, long long __C,
+ long long __D, long long __E, long long __F,
+ long long __G, long long __H)
+{
+ return __extension__ (__m512i) (__v8di)
+ { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \
+ _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_set_pd (double __A, double __B, double __C, double __D,
+ double __E, double __F, double __G, double __H)
+{
+ return __extension__ (__m512d)
+ { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \
+ _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_set_ps (float __A, float __B, float __C, float __D,
+ float __E, float __F, float __G, float __H,
+ float __I, float __J, float __K, float __L,
+ float __M, float __N, float __O, float __P)
+{
+ return __extension__ (__m512)
+ { __P, __O, __N, __M, __L, __K, __J, __I,
+ __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
+ _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
+ (e4),(e3),(e2),(e1),(e0))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_abs_ps(__m512 __A)
+{
+ return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
+{
+ return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ;
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_abs_pd(__m512d __A)
+{
+ return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ;
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
+{
+ return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A);
+}
+
+/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars
+ * as outputs. This class of vector operation forms the basis of many
+ * scientific computations. In vector-reduction arithmetic, the evaluation
+ * order is independent of the order of the input elements of V.
+ *
+ * For floating-point intrinsics:
+ * 1. When using fadd/fmul intrinsics, the order of operations within the
+ *    vector is unspecified (associative math).
+ * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
+ *    produce unspecified results.
+ *
+ * The reductions below use a bisection method: at each step, the vector from
+ * the previous step is partitioned in half and the operation is performed on
+ * the two halves. This takes log2(n) steps, where n is the number of elements
+ * in the vector.
+ */
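+
+/* As an illustration of the bisection order (a sketch only; the exact
+ * pairing is an implementation detail and, for floating point, unspecified):
+ *
+ *   __m512i v = _mm512_set_epi64(8, 7, 6, 5, 4, 3, 2, 1);  // lanes {1,...,8}
+ *   long long s = _mm512_reduce_add_epi64(v);               // s == 36
+ *
+ * one possible log2(8) = 3 step sequence is
+ *   step 1: {1+5, 2+6, 3+7, 4+8} -> {6, 8, 10, 12}
+ *   step 2: {6+10, 8+12}         -> {16, 20}
+ *   step 3: {16+20}              -> 36
+ */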
+
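+/* The __clang_major__ guards below fall back to the x86-specific
+ * __builtin_ia32_reduce_* builtins on older clang releases, where the generic
+ * __builtin_reduce_* builtins used by newer upstream headers are not (or not
+ * all) available. */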
+static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
+#if (__clang_major__ > 14)
+ return __builtin_reduce_add((__v8di)__W);
+#else
+ return __builtin_ia32_reduce_add_q512(__W);
+#endif
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
+#if (__clang_major__ > 14)
+ return __builtin_reduce_mul((__v8di)__W);
+#else
+ return __builtin_ia32_reduce_mul_q512(__W);
+#endif
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_and_q512(__W);
+#else
+ return __builtin_reduce_and((__v8di)__W);
+#endif
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_or_q512(__W);
+#else
+ return __builtin_reduce_or((__v8di)__W);
+#endif
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
+ __W = _mm512_maskz_mov_epi64(__M, __W);
+#if (__clang_major__ > 14)
+ return __builtin_reduce_add((__v8di)__W);
+#else
+ return __builtin_ia32_reduce_add_q512(__W);
+#endif
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
+ __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
+#if (__clang_major__ > 14)
+ return __builtin_reduce_mul((__v8di)__W);
+#else
+ return __builtin_ia32_reduce_mul_q512(__W);
+#endif
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
+ __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W);
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_and_q512(__W);
+#else
+ return __builtin_reduce_and((__v8di)__W);
+#endif
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
+ __W = _mm512_maskz_mov_epi64(__M, __W);
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_or_q512(__W);
+#else
+ return __builtin_reduce_or((__v8di)__W);
+#endif
+}
+
+// -0.0 is used to ignore the start value since it is the neutral value of
+// floating point addition. For more information, please refer to
+// https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic
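+// (As an illustration: with the default rounding, (+0.0) + (-0.0) == +0.0,
+// so a start value of +0.0 would not preserve the sign of a reduction whose
+// elements are all -0.0, whereas (-0.0) + (-0.0) == -0.0 does.)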
+static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
+ return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
+ return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
+ __W = _mm512_maskz_mov_pd(__M, __W);
+ return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
+ __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
+ return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_reduce_add_epi32(__m512i __W) {
+#if (__clang_major__ > 14)
+ return __builtin_reduce_add((__v16si)__W);
+#else
+ return __builtin_ia32_reduce_add_d512((__v16si)__W);
+#endif
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_reduce_mul_epi32(__m512i __W) {
+#if (__clang_major__ > 14)
+ return __builtin_reduce_mul((__v16si)__W);
+#else
+ return __builtin_ia32_reduce_mul_d512((__v16si)__W);
+#endif
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_reduce_and_epi32(__m512i __W) {
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_and_d512((__v16si)__W);
+#else
+ return __builtin_reduce_and((__v16si)__W);
+#endif
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_reduce_or_epi32(__m512i __W) {
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_or_d512((__v16si)__W);
+#else
+ return __builtin_reduce_or((__v16si)__W);
+#endif
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
+ __W = _mm512_maskz_mov_epi32(__M, __W);
+#if (__clang_major__ > 14)
+ return __builtin_reduce_add((__v16si)__W);
+#else
+ return __builtin_ia32_reduce_add_d512((__v16si)__W);
+#endif
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
+ __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
+#if (__clang_major__ > 14)
+ return __builtin_reduce_mul((__v16si)__W);
+#else
+ return __builtin_ia32_reduce_mul_d512((__v16si)__W);
+#endif
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
+ __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W);
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_and_d512((__v16si)__W);
+#else
+ return __builtin_reduce_and((__v16si)__W);
+#endif
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
+ __W = _mm512_maskz_mov_epi32(__M, __W);
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_or_d512((__v16si)__W);
+#else
+ return __builtin_reduce_or((__v16si)__W);
+#endif
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_reduce_add_ps(__m512 __W) {
+ return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_reduce_mul_ps(__m512 __W) {
+ return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
+ __W = _mm512_maskz_mov_ps(__M, __W);
+ return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
+ __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
+ return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_epi64(__m512i __V) {
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_smax_q512(__V);
+#else
+ return __builtin_reduce_max((__v8di)__V);
+#endif
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_epu64(__m512i __V) {
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_umax_q512(__V);
+#else
+ return __builtin_reduce_max((__v8du)__V);
+#endif
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_epi64(__m512i __V) {
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_smin_q512(__V);
+#else
+ return __builtin_reduce_min((__v8di)__V);
+#endif
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_epu64(__m512i __V) {
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_umin_q512(__V);
+#else
+ return __builtin_reduce_min((__v8du)__V);
+#endif
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
+ __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_smax_q512(__V);
+#else
+ return __builtin_reduce_max((__v8di)__V);
+#endif
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
+ __V = _mm512_maskz_mov_epi64(__M, __V);
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_umax_q512(__V);
+#else
+ return __builtin_reduce_max((__v8du)__V);
+#endif
+}
+
+static __inline__ long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
+ __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_smin_q512(__V);
+#else
+ return __builtin_reduce_min((__v8di)__V);
+#endif
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
+ __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V);
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_umin_q512(__V);
+#else
+ return __builtin_reduce_min((__v8du)__V);
+#endif
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_epi32(__m512i __V) {
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_smax_d512((__v16si)__V);
+#else
+ return __builtin_reduce_max((__v16si)__V);
+#endif
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_epu32(__m512i __V) {
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_umax_d512((__v16si)__V);
+#else
+ return __builtin_reduce_max((__v16su)__V);
+#endif
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_epi32(__m512i __V) {
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_smin_d512((__v16si)__V);
+#else
+ return __builtin_reduce_min((__v16si)__V);
+#endif
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_epu32(__m512i __V) {
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_umin_d512((__v16si)__V);
+#else
+ return __builtin_reduce_min((__v16su)__V);
+#endif
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
+ __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_smax_d512((__v16si)__V);
+#else
+ return __builtin_reduce_max((__v16si)__V);
+#endif
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
+ __V = _mm512_maskz_mov_epi32(__M, __V);
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_umax_d512((__v16si)__V);
+#else
+ return __builtin_reduce_max((__v16su)__V);
+#endif
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
+ __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_smin_d512((__v16si)__V);
+#else
+ return __builtin_reduce_min((__v16si)__V);
+#endif
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
+ __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V);
+#if (__clang_major__ < 14)
+ return __builtin_ia32_reduce_umin_d512((__v16si)__V);
+#else
+ return __builtin_reduce_min((__v16su)__V);
+#endif
+}
+
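+/* Usage sketch (editor's illustrative comment, not upstream code): the masked
+ * integer reductions above substitute the operation's identity value for the
+ * masked-off lanes. The names `v`, `keep`, `sum` and `lo` are hypothetical.
+ *
+ *   __m512i v = _mm512_set1_epi32(3);
+ *   __mmask16 keep = 0x00FF;                          // keep the low 8 lanes
+ *   int sum = _mm512_mask_reduce_add_epi32(keep, v);  // 8 * 3 == 24
+ *   int lo  = _mm512_mask_reduce_min_epi32(keep, v);  // 3 (identity INT_MAX)
+ */
+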
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_pd(__m512d __V) {
+ return __builtin_ia32_reduce_fmax_pd512(__V);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_pd(__m512d __V) {
+ return __builtin_ia32_reduce_fmin_pd512(__V);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
+ __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
+ return __builtin_ia32_reduce_fmax_pd512(__V);
+}
+
+static __inline__ double __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
+ __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
+ return __builtin_ia32_reduce_fmin_pd512(__V);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_ps(__m512 __V) {
+ return __builtin_ia32_reduce_fmax_ps512(__V);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_ps(__m512 __V) {
+ return __builtin_ia32_reduce_fmin_ps512(__V);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
+ __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
+ return __builtin_ia32_reduce_fmax_ps512(__V);
+}
+
+static __inline__ float __DEFAULT_FN_ATTRS512
+_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
+ __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
+ return __builtin_ia32_reduce_fmin_ps512(__V);
+}
+
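+/* Usage sketch (editor's illustrative comment): the masked floating-point
+ * min/max reductions substitute +/-infinity for masked-off lanes, so a single
+ * surviving lane determines the result. `v` is a hypothetical value.
+ *
+ *   __m512 v = _mm512_set1_ps(2.0f);
+ *   float m = _mm512_mask_reduce_max_ps(0x0001, v);   // 2.0f
+ */
+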
+/// Moves the least significant 32 bits of a vector of [16 x i32] to a
+/// 32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __A
+/// A vector of [16 x i32]. The least significant 32 bits are moved to the
+/// destination.
+/// \returns A 32-bit signed integer containing the moved value.
+static __inline__ int __DEFAULT_FN_ATTRS512
+_mm512_cvtsi512_si32(__m512i __A) {
+ __v16si __b = (__v16si)__A;
+ return __b[0];
+}
+
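+/* Usage sketch (editor's illustrative comment):
+ *
+ *   int lane0 = _mm512_cvtsi512_si32(_mm512_set1_epi32(7));   // 7
+ */
+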
+/// Loads 8 double-precision (64-bit) floating-point elements stored at memory
+/// locations starting at location \a base_addr at packed 32-bit integer indices
+/// stored in the lower half of \a vindex scaled by \a scale, and stores them
+/// in dst.
+///
+/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// i := j*64
+/// m := j*32
+/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+/// dst[i+63:i] := MEM[addr+63:addr]
+/// ENDFOR
+/// dst[MAX:512] := 0
+/// \endoperation
+#define _mm512_i32logather_pd(vindex, base_addr, scale) \
+ _mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale))
+
+/// Loads 8 double-precision (64-bit) floating-point elements from memory
+/// starting at location \a base_addr at packed 32-bit integer indices stored in
+/// the lower half of \a vindex scaled by \a scale into dst using writemask
+/// \a mask (elements are copied from \a src when the corresponding mask bit is
+/// not set).
+///
+/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// i := j*64
+/// m := j*32
+/// IF mask[j]
+/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+/// dst[i+63:i] := MEM[addr+63:addr]
+/// ELSE
+/// dst[i+63:i] := src[i+63:i]
+/// FI
+/// ENDFOR
+/// dst[MAX:512] := 0
+/// \endoperation
+#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \
+ _mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex), \
+ (base_addr), (scale))
+
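+/* Usage sketch (editor's illustrative comment): gathering 8 doubles through
+ * 32-bit indices held in the low half of a 512-bit index vector. `table` and
+ * `idx` are hypothetical names; the scale of 8 matches sizeof(double).
+ *
+ *   double table[256] = {0};
+ *   __m512i idx = _mm512_set1_epi32(4);                // every lane reads table[4]
+ *   __m512d g = _mm512_i32logather_pd(idx, table, 8);
+ */
+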
+/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
+/// at packed 32-bit integer indices stored in the lower half of \a vindex
+/// scaled by \a scale and stores them in dst.
+///
+/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// i := j*64
+/// m := j*32
+/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+/// dst[i+63:i] := MEM[addr+63:addr]
+/// ENDFOR
+/// dst[MAX:512] := 0
+/// \endoperation
+#define _mm512_i32logather_epi64(vindex, base_addr, scale) \
+ _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale))
+
+/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
+/// at packed 32-bit integer indices stored in the lower half of \a vindex
+/// scaled by \a scale and stores them in dst using writemask \a mask (elements
+/// are copied from \a src when the corresponding mask bit is not set).
+///
+/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// i := j*64
+/// m := j*32
+/// IF mask[j]
+/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+/// dst[i+63:i] := MEM[addr+63:addr]
+/// ELSE
+/// dst[i+63:i] := src[i+63:i]
+/// FI
+/// ENDFOR
+/// dst[MAX:512] := 0
+/// \endoperation
+#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \
+ _mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex), \
+ (base_addr), (scale))
+
+/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
+/// to memory locations starting at location \a base_addr at packed 32-bit
+/// integer indices stored in \a vindex scaled by \a scale.
+///
+/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// i := j*64
+/// m := j*32
+/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+/// MEM[addr+63:addr] := v1[i+63:i]
+/// ENDFOR
+/// \endoperation
+#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \
+ _mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale))
+
+/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
+/// to memory locations starting at location \a base_addr at packed 32-bit
+/// integer indices stored in \a vindex scaled by \a scale. Only those elements
+/// whose corresponding mask bit is set in writemask \a mask are written to
+/// memory.
+///
+/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// i := j*64
+/// m := j*32
+/// IF mask[j]
+/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+/// MEM[addr+63:addr] := v1[i+63:i]
+/// FI
+/// ENDFOR
+/// \endoperation
+#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \
+ _mm512_mask_i32scatter_pd((base_addr), (mask), \
+ _mm512_castsi512_si256(vindex), (v1), (scale))
+
+/// Stores 8 packed 64-bit integer elements in \a v1 to memory locations
+/// starting at location \a base_addr at packed 32-bit integer indices stored
+/// in \a vindex scaled by \a scale.
+///
+/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// i := j*64
+/// m := j*32
+/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+/// MEM[addr+63:addr] := v1[i+63:i]
+/// ENDFOR
+/// \endoperation
+#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \
+ _mm512_i32scatter_epi64((base_addr), \
+ _mm512_castsi512_si256(vindex), (v1), (scale))
+
+/// Stores 8 packed 64-bit integer elements in \a v1 to memory locations
+/// starting at location \a base_addr at packed 32-bit integer indices stored
+/// in \a vindex scaled by \a scale, using writemask \a mask (elements whose
+/// corresponding mask bit is not set are not written to memory).
+///
+/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// i := j*64
+/// m := j*32
+/// IF mask[j]
+/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
+/// MEM[addr+63:addr] := v1[i+63:i]
+/// FI
+/// ENDFOR
+/// \endoperation
+#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \
+ _mm512_mask_i32scatter_epi64((base_addr), (mask), \
+ _mm512_castsi512_si256(vindex), (v1), (scale))
+
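+/* Usage sketch (editor's illustrative comment): scattering four of the eight
+ * 64-bit lanes under a writemask. `out`, `idx` and `v` are hypothetical names;
+ * the scale of 8 matches sizeof(long long).
+ *
+ *   long long out[16] = {0};
+ *   __m512i idx = _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 7, 6, 5, 4, 3, 2, 1, 0);
+ *   __m512i v   = _mm512_set1_epi64(42);
+ *   _mm512_mask_i32loscatter_epi64(out, 0x0F, idx, v, 8);   // writes out[0..3]
+ */
+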
+#undef __DEFAULT_FN_ATTRS512
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __AVX512FINTRIN_H */
--- /dev/null
+/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FP16INTRIN_H
+#define __AVX512FP16INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
+typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
+typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
+typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
+typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
+typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
+typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
+typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
+typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS512 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
+ __min_vector_width__(512)))
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
+ __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
+ __min_vector_width__(128)))
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
+ return __a[0];
+}
+
+static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
+ return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+}
+
+static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
+ return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
+ return (__m256h)__builtin_ia32_undef256();
+}
+
+static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
+ return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
+ return (__m128h)__builtin_ia32_undef128();
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
+ return (__m512h)__builtin_ia32_undef512();
+}
+
+static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
+ return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
+ __h, __h, __h, __h, __h, __h, __h, __h,
+ __h, __h, __h, __h, __h, __h, __h, __h,
+ __h, __h, __h, __h, __h, __h, __h, __h};
+}
+
+static __inline __m512h __DEFAULT_FN_ATTRS512
+_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
+ _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
+ _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
+ _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
+ _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
+ _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
+ _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
+ _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
+ return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
+ __h25, __h24, __h23, __h22, __h21, __h20, __h19,
+ __h18, __h17, __h16, __h15, __h14, __h13, __h12,
+ __h11, __h10, __h9, __h8, __h7, __h6, __h5,
+ __h4, __h3, __h2, __h1};
+}
+
+#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
+ h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24, \
+ h25, h26, h27, h28, h29, h30, h31, h32) \
+ _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
+ (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
+ (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6), \
+ (h5), (h4), (h3), (h2), (h1))
+
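+/* Note (editor's illustrative comment): _mm512_set_ph() takes its 32 arguments
+ * from the highest lane down to lane 0, _mm512_setr_ph() lists them in memory
+ * order starting at lane 0, and _mm512_set1_ph() broadcasts a single value:
+ *
+ *   __m512h ones = _mm512_set1_ph((_Float16)1.0);   // all 32 lanes == 1.0
+ *   _Float16 lane0 = _mm512_cvtsh_h(ones);          // 1.0
+ */
+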
+static __inline __m512h __DEFAULT_FN_ATTRS512
+_mm512_set1_pch(_Float16 _Complex h) {
+ return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h));
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
+ return (__m128)__a;
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
+ return (__m256)__a;
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
+ return (__m512)__a;
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
+ return (__m128d)__a;
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
+ return (__m256d)__a;
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
+ return (__m512d)__a;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
+ return (__m128i)__a;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_castph_si256(__m256h __a) {
+ return (__m256i)__a;
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_castph_si512(__m512h __a) {
+ return (__m512i)__a;
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
+ return (__m128h)__a;
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
+ return (__m256h)__a;
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
+ return (__m512h)__a;
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
+ return (__m128h)__a;
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
+ return (__m256h)__a;
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
+ return (__m512h)__a;
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
+ return (__m128h)__a;
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_castsi256_ph(__m256i __a) {
+ return (__m256h)__a;
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_castsi512_ph(__m512i __a) {
+ return (__m512h)__a;
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_castph256_ph128(__m256h __a) {
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS512
+_mm512_castph512_ph128(__m512h __a) {
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS512
+_mm512_castph512_ph256(__m512h __a) {
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_castph128_ph256(__m128h __a) {
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
+ -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_castph128_ph512(__m128h __a) {
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_castph256_ph512(__m256h __a) {
+ return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1);
+}
+
+/// Constructs a 256-bit floating-point vector of [16 x half] from a
+/// 128-bit floating-point vector of [8 x half]. The lower 128 bits
+/// contain the value of the source vector. The upper 128 bits are set
+/// to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x half].
+/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
+/// contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_zextph128_ph256(__m128h __a) {
+ return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+/// Constructs a 512-bit floating-point vector of [32 x half] from a
+/// 128-bit floating-point vector of [8 x half]. The lower 128 bits
+/// contain the value of the source vector. The upper 384 bits are set
+/// to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x half].
+/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
+/// contain the value of the parameter. The upper 384 bits are set to zero.
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_zextph128_ph512(__m128h __a) {
+ return __builtin_shufflevector(
+ __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+/// Constructs a 512-bit floating-point vector of [32 x half] from a
+/// 256-bit floating-point vector of [16 x half]. The lower 256 bits
+/// contain the value of the source vector. The upper 256 bits are set
+/// to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit vector of [16 x half].
+/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
+/// contain the value of the parameter. The upper 256 bits are set to zero.
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_zextph256_ph512(__m256h __a) {
+ return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31);
+}
+
+#define _mm_comi_round_sh(A, B, P, R) \
+ __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))
+
+#define _mm_comi_sh(A, B, pred) \
+ _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A,
+ __m128h B) {
+ return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A,
+ __m128h B) {
+ return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A,
+ __m128h B) {
+ return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A,
+ __m128h B) {
+ return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A,
+ __m128h B) {
+ return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A,
+ __m128h B) {
+ return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A,
+ __m128h B) {
+ return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A,
+ __m128h B) {
+ return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A,
+ __m128h B) {
+ return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A,
+ __m128h B) {
+ return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A,
+ __m128h B) {
+ return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A,
+ __m128h B) {
+ return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
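+/* Usage sketch (editor's illustrative comment): comparing the lowest
+ * half-precision lanes of two vectors. `a` and `b` are hypothetical; lane 0 is
+ * written with the element-access style the scalar helpers below also use.
+ *
+ *   __m128h a = _mm_setzero_ph(), b = _mm_setzero_ph();
+ *   a[0] = (_Float16)1.0;
+ *   int gt = _mm_comigt_sh(a, b);   // 1, since 1.0 > 0.0
+ */
+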
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
+ __m512h __B) {
+ return (__m512h)((__v32hf)__A + (__v32hf)__B);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_selectph_512(
+ (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
+ (__v32hf)_mm512_add_ph(__A, __B),
+ (__v32hf)_mm512_setzero_ph());
+}
+
+#define _mm512_add_round_ph(A, B, R) \
+ ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
+ (__v32hf)(__m512h)(B), (int)(R)))
+
+#define _mm512_mask_add_round_ph(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
+ (__v32hf)(__m512h)(W)))
+
+#define _mm512_maskz_add_round_ph(U, A, B, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
+ (__v32hf)_mm512_setzero_ph()))
+
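+/* Usage sketch (editor's illustrative comment): the *_round_* forms embed a
+ * rounding mode in the instruction; a specific mode has to be combined with
+ * _MM_FROUND_NO_EXC. `a` and `b` are hypothetical __m512h values.
+ *
+ *   __m512h s = _mm512_add_round_ph(a, b,
+ *                                   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+ */
+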
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
+ __m512h __B) {
+ return (__m512h)((__v32hf)__A - (__v32hf)__B);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_selectph_512(
+ (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
+ (__v32hf)_mm512_sub_ph(__A, __B),
+ (__v32hf)_mm512_setzero_ph());
+}
+
+#define _mm512_sub_round_ph(A, B, R) \
+ ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
+ (__v32hf)(__m512h)(B), (int)(R)))
+
+#define _mm512_mask_sub_round_ph(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
+ (__v32hf)(__m512h)(W)))
+
+#define _mm512_maskz_sub_round_ph(U, A, B, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
+ (__v32hf)_mm512_setzero_ph()))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
+ __m512h __B) {
+ return (__m512h)((__v32hf)__A * (__v32hf)__B);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_selectph_512(
+ (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
+ (__v32hf)_mm512_mul_ph(__A, __B),
+ (__v32hf)_mm512_setzero_ph());
+}
+
+#define _mm512_mul_round_ph(A, B, R) \
+ ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
+ (__v32hf)(__m512h)(B), (int)(R)))
+
+#define _mm512_mask_mul_round_ph(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
+ (__v32hf)(__m512h)(W)))
+
+#define _mm512_maskz_mul_round_ph(U, A, B, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
+ (__v32hf)_mm512_setzero_ph()))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
+ __m512h __B) {
+ return (__m512h)((__v32hf)__A / (__v32hf)__B);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_selectph_512(
+ (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
+ (__v32hf)_mm512_div_ph(__A, __B),
+ (__v32hf)_mm512_setzero_ph());
+}
+
+#define _mm512_div_round_ph(A, B, R) \
+ ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
+ (__v32hf)(__m512h)(B), (int)(R)))
+
+#define _mm512_mask_div_round_ph(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
+ (__v32hf)(__m512h)(W)))
+
+#define _mm512_maskz_div_round_ph(U, A, B, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
+ (__v32hf)_mm512_setzero_ph()))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
+ __m512h __B) {
+ return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_selectph_512(
+ (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
+ (__v32hf)_mm512_min_ph(__A, __B),
+ (__v32hf)_mm512_setzero_ph());
+}
+
+#define _mm512_min_round_ph(A, B, R) \
+ ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
+ (__v32hf)(__m512h)(B), (int)(R)))
+
+#define _mm512_mask_min_round_ph(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
+ (__v32hf)(__m512h)(W)))
+
+#define _mm512_maskz_min_round_ph(U, A, B, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
+ (__v32hf)_mm512_setzero_ph()))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
+ __m512h __B) {
+ return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_selectph_512(
+ (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
+ (__v32hf)_mm512_max_ph(__A, __B),
+ (__v32hf)_mm512_setzero_ph());
+}
+
+#define _mm512_max_round_ph(A, B, R) \
+ ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
+ (__v32hf)(__m512h)(B), (int)(R)))
+
+#define _mm512_mask_max_round_ph(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
+ (__v32hf)(__m512h)(W)))
+
+#define _mm512_maskz_max_round_ph(U, A, B, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
+ (__v32hf)_mm512_setzero_ph()))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
+ return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
+ return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_selectps_512(
+ (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
+ (__v16sf)_mm512_conj_pch(__A),
+ (__v16sf)_mm512_setzero_ps());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
+ __m128h __B) {
+ __A[0] += __B[0];
+ return __A;
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ __A = _mm_add_sh(__A, __B);
+ return __builtin_ia32_selectsh_128(__U, __A, __W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ __A = _mm_add_sh(__A, __B);
+ return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
+}
+
+#define _mm_add_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_addsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_add_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_addsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_add_round_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_addsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
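+/* Usage sketch (editor's illustrative comment): the scalar *_sh operations only
+ * touch lane 0 and pass the upper seven lanes of the first operand through.
+ * `a`, `b` and `r` are hypothetical names.
+ *
+ *   __m128h a = _mm_setzero_ph(), b = _mm_setzero_ph();
+ *   a[0] = (_Float16)1.0; b[0] = (_Float16)2.0;
+ *   __m128h r = _mm_add_sh(a, b);   // r[0] == 3.0, r[1..7] copied from a
+ */
+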
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
+ __m128h __B) {
+ __A[0] -= __B[0];
+ return __A;
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ __A = _mm_sub_sh(__A, __B);
+ return __builtin_ia32_selectsh_128(__U, __A, __W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ __A = _mm_sub_sh(__A, __B);
+ return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
+}
+
+#define _mm_sub_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_subsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_sub_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_subsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_sub_round_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_subsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
+ __m128h __B) {
+ __A[0] *= __B[0];
+ return __A;
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ __A = _mm_mul_sh(__A, __B);
+ return __builtin_ia32_selectsh_128(__U, __A, __W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ __A = _mm_mul_sh(__A, __B);
+ return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
+}
+
+#define _mm_mul_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_mulsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_mul_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_mulsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_mul_round_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_mulsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
+ __m128h __B) {
+ __A[0] /= __B[0];
+ return __A;
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ __A = _mm_div_sh(__A, __B);
+ return __builtin_ia32_selectsh_128(__U, __A, __W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ __A = _mm_div_sh(__A, __B);
+ return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
+}
+
+#define _mm_div_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_divsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_div_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_divsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_div_round_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_divsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_minsh_round_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_minsh_round_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_min_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_minsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_min_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_minsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_min_round_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_minsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_maxsh_round_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_maxsh_round_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_max_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_maxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_max_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_maxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_max_round_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_maxsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_cmp_round_ph_mask(A, B, P, R) \
+ ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
+ (__v32hf)(__m512h)(B), (int)(P), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
+ ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
+ (__v32hf)(__m512h)(B), (int)(P), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_cmp_ph_mask(A, B, P) \
+ _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
+ _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
+
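+/* Usage sketch (editor's illustrative comment): vector comparisons return one
+ * mask bit per lane, which can feed the masked arithmetic above. `a` and `b`
+ * are hypothetical __m512h values.
+ *
+ *   __mmask32 lt = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
+ *   __m512h sum = _mm512_maskz_add_ph(lt, a, b);   // a+b where a < b, else 0
+ */
+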
+#define _mm_cmp_round_sh_mask(X, Y, P, R) \
+ ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), (int)(P), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
+ ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), (int)(P), \
+ (__mmask8)(M), (int)(R)))
+
+#define _mm_cmp_sh_mask(X, Y, P) \
+ ((__mmask8)__builtin_ia32_cmpsh_mask( \
+ (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
+ ((__mmask8)__builtin_ia32_cmpsh_mask( \
+ (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \
+ _MM_FROUND_CUR_DIRECTION))
+// loads with vmovsh:
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
+ struct __mm_load_sh_struct {
+ _Float16 __u;
+ } __attribute__((__packed__, __may_alias__));
+ _Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u;
+ return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
+ __m128h src = (__v8hf)__builtin_shufflevector(
+ (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
+
+ return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
+ return (__m128h)__builtin_ia32_loadsh128_mask(
+ (__v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_load_ph(void const *__p) {
+ return *(const __m512h *)__p;
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_load_ph(void const *__p) {
+ return *(const __m256h *)__p;
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
+ return *(const __m128h *)__p;
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_loadu_ph(void const *__p) {
+ struct __loadu_ph {
+ __m512h_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_ph *)__p)->__v;
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_loadu_ph(void const *__p) {
+ struct __loadu_ph {
+ __m256h_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_ph *)__p)->__v;
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
+ struct __loadu_ph {
+ __m128h_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_ph *)__p)->__v;
+}
+
+// stores with vmovsh:
+static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
+ __m128h __a) {
+ struct __mm_store_sh_struct {
+ _Float16 __u;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
+ __mmask8 __U,
+ __m128h __A) {
+ __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
+ __m512h __A) {
+ *(__m512h *)__P = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
+ __m256h __A) {
+ *(__m256h *)__P = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
+ __m128h __A) {
+ *(__m128h *)__P = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
+ __m512h __A) {
+ struct __storeu_ph {
+ __m512h_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_ph *)__P)->__v = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
+ __m256h __A) {
+ struct __storeu_ph {
+ __m256h_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_ph *)__P)->__v = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
+ __m128h __A) {
+ struct __storeu_ph {
+ __m128h_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_ph *)__P)->__v = __A;
+}
+
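+/* Usage sketch (editor's illustrative comment): unaligned round trip of 32
+ * _Float16 values through memory. `buf` is a hypothetical array.
+ *
+ *   _Float16 buf[32] = {0};
+ *   __m512h v = _mm512_loadu_ph(buf);
+ *   _mm512_storeu_ph(buf, v);
+ */
+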
+// moves with vmovsh:
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
+ __m128h __b) {
+ __a[0] = __b[0];
+ return __a;
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
+ _mm_setzero_ph());
+}
+
+// vmovw:
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
+ return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
+ __v8hi __b = (__v8hi)__a;
+ return __b[0];
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
+ return (__m512h)__builtin_ia32_rcpph512_mask(
+ (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
+ (__mmask32)__U);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_rcpph512_mask(
+ (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
+ return (__m512h)__builtin_ia32_rsqrtph512_mask(
+ (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
+ (__mmask32)__U);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_rsqrtph512_mask(
+ (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
+}
+
+#define _mm512_getmant_ph(A, B, C) \
+ ((__m512h)__builtin_ia32_getmantph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_ph(W, U, A, B, C) \
+ ((__m512h)__builtin_ia32_getmantph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getmant_ph(U, A, B, C) \
+ ((__m512h)__builtin_ia32_getmantph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_getmant_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_getmantph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_getmantph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_getmantph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
+ return (__m512h)__builtin_ia32_getexpph512_mask(
+ (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_getexpph512_mask(
+ (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_getexpph512_mask(
+ (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_getexp_round_ph(A, R) \
+ ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
+ (__v32hf)_mm512_undefined_ph(), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_getexp_round_ph(W, U, A, R) \
+ ((__m512h)__builtin_ia32_getexpph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_getexp_round_ph(U, A, R) \
+ ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
+ (__v32hf)_mm512_setzero_ph(), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
+ __m512h __B) {
+ return (__m512h)__builtin_ia32_scalefph512_mask(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
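+/* Usage sketch (editor's illustrative comment): scalef multiplies each lane of
+ * the first operand by two raised to the floor of the corresponding lane of
+ * the second operand.
+ *
+ *   __m512h r = _mm512_scalef_ph(_mm512_set1_ph((_Float16)1.5),
+ *                                _mm512_set1_ph((_Float16)3.0));   // all lanes 12.0
+ */
+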
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__W, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_scalefph512_mask(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_scalef_round_ph(A, B, R) \
+ ((__m512h)__builtin_ia32_scalefph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
+ (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_scalefph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
+ ((__m512h)__builtin_ia32_scalefph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
+
+#define _mm512_roundscale_ph(A, B) \
+ ((__m512h)__builtin_ia32_rndscaleph_mask( \
+ (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_roundscale_ph(A, B, C, imm) \
+ ((__m512h)__builtin_ia32_rndscaleph_mask( \
+ (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
+ (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_roundscale_ph(A, B, imm) \
+ ((__m512h)__builtin_ia32_rndscaleph_mask( \
+ (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
+ (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
+ ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
+ (__v32hf)(__m512h)(A), \
+ (__mmask32)(B), (int)(R)))
+
+#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
+ ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
+ (__v32hf)_mm512_setzero_ph(), \
+ (__mmask32)(A), (int)(R)))
+
+#define _mm512_roundscale_round_ph(A, imm, R) \
+ ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
+ (__v32hf)_mm512_undefined_ph(), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_reduce_ph(A, imm) \
+ ((__m512h)__builtin_ia32_reduceph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
+ (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_reduce_ph(W, U, A, imm) \
+ ((__m512h)__builtin_ia32_reduceph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_reduce_ph(U, A, imm) \
+ ((__m512h)__builtin_ia32_reduceph512_mask( \
+ (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
+ (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
+ ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
+ (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
+ ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
+ (__v32hf)_mm512_setzero_ph(), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_reduce_round_ph(A, imm, R) \
+ ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
+ (__v32hf)_mm512_undefined_ph(), \
+ (__mmask32)-1, (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_rcpsh_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_rcpsh_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_rsqrtsh_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_rsqrtsh_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+#define _mm_getmant_round_sh(A, B, C, D, R) \
+ ((__m128h)__builtin_ia32_getmantsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))
+
+#define _mm_getmant_sh(A, B, C, D) \
+ ((__m128h)__builtin_ia32_getmantsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
+ ((__m128h)__builtin_ia32_getmantsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
+ (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
+ ((__m128h)__builtin_ia32_getmantsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
+ (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_getmant_sh(U, A, B, C, D) \
+ ((__m128h)__builtin_ia32_getmantsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
+ ((__m128h)__builtin_ia32_getmantsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
+ (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
+
+#define _mm_getexp_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_getexpsh128_round_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_getexpsh128_round_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_getexpsh128_round_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_getexp_round_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_scalef_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_scalefsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_scalefsh_round_mask(
+ (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_scalefsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_scalefsh_round_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_scalef_round_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_scalefsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_roundscale_round_sh(A, B, imm, R) \
+ ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(imm), (int)(R)))
+
+#define _mm_roundscale_sh(A, B, imm) \
+ ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_roundscale_sh(W, U, A, B, I) \
+ ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
+ ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(I), (int)(R)))
+
+#define _mm_maskz_roundscale_sh(U, A, B, I) \
+ ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
+ ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(I), (int)(R)))
+
+#define _mm_reduce_sh(A, B, C) \
+ ((__m128h)__builtin_ia32_reducesh_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_reduce_sh(W, U, A, B, C) \
+ ((__m128h)__builtin_ia32_reducesh_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_reduce_sh(U, A, B, C) \
+ ((__m128h)__builtin_ia32_reducesh_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_reduce_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_reducesh_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(C), (int)(R)))
+
+#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_reducesh_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(C), (int)(R)))
+
+#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_reducesh_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(C), (int)(R)))
+
+#define _mm512_sqrt_round_ph(A, R) \
+ ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))
+
+#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
+ (__v32hf)(__m512h)(W)))
+
+#define _mm512_maskz_sqrt_round_ph(U, A, R) \
+ ((__m512h)__builtin_ia32_selectph_512( \
+ (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
+ (__v32hf)_mm512_setzero_ph()))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
+ return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_selectph_512(
+ (__mmask32)(__U),
+ (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
+ (__v32hf)(__m512h)(__W));
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
+ return (__m512h)__builtin_ia32_selectph_512(
+ (__mmask32)(__U),
+ (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
+ (__v32hf)_mm512_setzero_ph());
+}
+
+#define _mm_sqrt_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_sqrtsh_round_mask(
+ (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
+ (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_sqrtsh_round_mask(
+ (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
+ (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_sqrtsh_round_mask(
+ (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
+ (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
+ ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
+ (int)(imm), (__mmask32)(U)))
+
+#define _mm512_fpclass_ph_mask(A, imm) \
+ ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
+ (int)(imm), (__mmask32)-1))
+
+#define _mm_fpclass_sh_mask(A, imm) \
+ ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm_mask_fpclass_sh_mask(U, A, imm) \
+ ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
+ (__mmask8)(U)))
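+
+/* Example (illustrative sketch): the fpclass immediate is a bit mask of
+ * categories (0x01 QNaN, 0x08 +Inf, 0x10 -Inf, 0x20 denormal, 0x80 SNaN, ...),
+ * so testing the low element of an __m128h value x for +/-infinity is:
+ *
+ *   __mmask8 is_inf = _mm_fpclass_sh_mask(x, 0x08 | 0x10);
+ */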
+
+#define _mm512_cvt_roundpd_ph(A, R) \
+ ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
+ (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
+
+#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
+ ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
+ ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
+ (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
+ return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
+ (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
+ return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
+ (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
+ return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
+ (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundph_pd(A, R) \
+ ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
+ (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
+
+#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
+ ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
+ ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
+ (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
+ return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
+ (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
+ return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
+ (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512d __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
+ return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
+ (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvt_roundsh_ss(A, B, R) \
+ ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
+ (__v4sf)_mm_undefined_ps(), \
+ (__mmask8)(-1), (int)(R)))
+
+#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
+ ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
+ (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
+ ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
+ __m128h __B) {
+ return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
+ (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
+ __mmask8 __U,
+ __m128 __A,
+ __m128h __B) {
+ return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
+ (__v4sf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
+ __m128 __A,
+ __m128h __B) {
+ return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
+ (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
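+
+/* Example (illustrative sketch; f is an __m128 and h an __m128h value): widen
+ * the low fp16 element of h to single precision, copying the upper three
+ * float lanes from f.
+ *
+ *   __m128 r = _mm_cvtsh_ss(f, h); // r[0] = (float)h[0], r[1..3] = f[1..3]
+ */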
+
+#define _mm_cvt_roundss_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
+ (__v8hf)_mm_undefined_ph(), \
+ (__mmask8)(-1), (int)(R)))
+
+#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
+ (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
+ (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
+ __m128 __B) {
+ return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
+ (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128 __B) {
+ return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
+ (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
+ __m128h __A,
+ __m128 __B) {
+ return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
+ (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvt_roundsd_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
+ (__v8hf)_mm_undefined_ph(), \
+ (__mmask8)(-1), (int)(R)))
+
+#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
+ (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
+ ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
+ (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
+ __m128d __B) {
+ return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
+ (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128d __B) {
+ return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
+ (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
+ return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
+ (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvt_roundsh_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
+ (__v2df)_mm_undefined_pd(), \
+ (__mmask8)(-1), (int)(R)))
+
+#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
+ ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
+ (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
+ ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
+ __m128h __B) {
+ return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
+ (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
+ __mmask8 __U,
+ __m128d __A,
+ __m128h __B) {
+ return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
+ (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
+ return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
+ (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundph_epi16(A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
+ (__v32hi)_mm512_undefined_epi32(), \
+ (__mmask32)(-1), (int)(R)))
+
+#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
+ (__v32hi)_mm512_setzero_epi32(), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtph_epi16(__m512h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2w512_mask(
+ (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2w512_mask(
+ (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2w512_mask(
+ (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundph_epi16(A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
+ (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
+ (int)(R)))
+
+#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
+ (__v32hi)_mm512_setzero_epi32(), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvttph_epi16(__m512h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2w512_mask(
+ (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2w512_mask(
+ (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2w512_mask(
+ (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi16_ph(A, R) \
+ ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
+ (__v32hf)_mm512_undefined_ph(), \
+ (__mmask32)(-1), (int)(R)))
+
+#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
+ ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
+ ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
+ (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_cvtepi16_ph(__m512i __A) {
+ return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
+ (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
+ return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
+ (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
+ return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
+ (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundph_epu16(A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
+ (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
+ (int)(R)))
+
+#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
+ (__v32hu)_mm512_setzero_epi32(), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtph_epu16(__m512h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
+ (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
+ (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
+ (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundph_epu16(A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
+ (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
+ (int)(R)))
+
+#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
+ (__v32hu)_mm512_setzero_epi32(), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvttph_epu16(__m512h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
+ (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
+ (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
+ (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu16_ph(A, R) \
+ ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
+ (__v32hf)_mm512_undefined_ph(), \
+ (__mmask32)(-1), (int)(R)))
+
+#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
+ ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
+ ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
+ (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_cvtepu16_ph(__m512i __A) {
+ return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
+ (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
+ return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
+ (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
+ return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
+ (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundph_epi32(A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
+ (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
+ (int)(R)))
+
+#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
+ (__v16si)_mm512_setzero_epi32(), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtph_epi32(__m256h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
+ (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
+ (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
+ (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundph_epu32(A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
+ (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
+ (int)(R)))
+
+#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
+ (__v16su)_mm512_setzero_epi32(), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtph_epu32(__m256h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
+ (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
+ (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
+ (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi32_ph(A, R) \
+ ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
+ (__v16hf)_mm256_undefined_ph(), \
+ (__mmask16)(-1), (int)(R)))
+
+#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
+ ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
+ ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
+ (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS512
+_mm512_cvtepi32_ph(__m512i __A) {
+ return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
+ (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
+ return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
+ (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
+ return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
+ (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
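+
+/* 16 x 32-bit integers convert to only 16 fp16 values, so the epi32/epu32
+ * conversions here return the half-width __m256h type rather than __m512h. */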
+
+#define _mm512_cvt_roundepu32_ph(A, R) \
+ ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
+ (__v16hf)_mm256_undefined_ph(), \
+ (__mmask16)(-1), (int)(R)))
+
+#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
+ ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
+ ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
+ (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS512
+_mm512_cvtepu32_ph(__m512i __A) {
+ return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
+ (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
+ return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
+ (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
+ return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
+ (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundph_epi32(A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
+ (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
+ (int)(R)))
+
+#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
+ (__v16si)_mm512_setzero_epi32(), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvttph_epi32(__m256h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
+ (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
+ (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
+ (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundph_epu32(A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
+ (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
+ (int)(R)))
+
+#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
+ (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
+ (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvttph_epu32(__m256h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
+ (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
+ (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
+ (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepi64_ph(A, R) \
+ ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
+ (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
+
+#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
+ ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
+ ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
+ (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS512
+_mm512_cvtepi64_ph(__m512i __A) {
+ return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
+ (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
+ return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
+ (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
+ return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
+ (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundph_epi64(A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
+ (__v8di)_mm512_undefined_epi32(), \
+ (__mmask8)(-1), (int)(R)))
+
+#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
+ (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtph_epi64(__m128h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
+ (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
+ (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
+ (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundepu64_ph(A, R) \
+ ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
+ (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
+
+#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
+ ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
+ ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
+ (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS512
+_mm512_cvtepu64_ph(__m512i __A) {
+ return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
+ (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
+ return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
+ (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
+ return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
+ (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvt_roundph_epu64(A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
+ (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
+ (int)(R)))
+
+#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
+ (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtph_epu64(__m128h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
+ (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
+ (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
+ return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
+ (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundph_epi64(A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
+ (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
+ (int)(R)))
+
+#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
+ (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvttph_epi64(__m128h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
+ (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
+ (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
+ (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtt_roundph_epu64(A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
+ (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
+ (int)(R)))
+
+#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
+ (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvttph_epu64(__m128h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
+ (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
+ (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
+ return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
+ (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvt_roundsh_i32(A, R) \
+ ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
+ return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvt_roundsh_u32(A, R) \
+ ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS128
+_mm_cvtsh_u32(__m128h __A) {
+ return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundsh_i64(A, R) \
+ ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
+
+static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
+ return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_cvt_roundsh_u64(A, R) \
+ ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
+_mm_cvtsh_u64(__m128h __A) {
+ return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
+ (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
+}
+#endif // __x86_64__
+
+#define _mm_cvt_roundu32_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
+ __A[0] = __B;
+ return __A;
+}
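+
+/* _mm_cvtu32_sh and the following non-rounding scalar converts simply
+ * overwrite element 0 of the vector (__A[0] = __B) and let the compiler emit
+ * the integer-to-_Float16 conversion; only the *_round_* macros call the
+ * explicit builtins with an encoded rounding mode. */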
+
+#ifdef __x86_64__
+#define _mm_cvt_roundu64_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
+ (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
+ __A[0] = __B;
+ return __A;
+}
+#endif
+
+#define _mm_cvt_roundi32_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
+ int __B) {
+ __A[0] = __B;
+ return __A;
+}
+
+#ifdef __x86_64__
+#define _mm_cvt_roundi64_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
+ long long __B) {
+ __A[0] = __B;
+ return __A;
+}
+#endif
+
+#define _mm_cvtt_roundsh_i32(A, R) \
+ ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
+
+static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
+ return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundsh_i64(A, R) \
+ ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
+
+static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
+ return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+#define _mm_cvtt_roundsh_u32(A, R) \
+ ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS128
+_mm_cvttsh_u32(__m128h __A) {
+ return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __x86_64__
+#define _mm_cvtt_roundsh_u64(A, R) \
+ ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
+_mm_cvttsh_u64(__m128h __A) {
+ return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
+ (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
+}
+#endif
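+
+/* The _mm_cvtsh_* converts use the current MXCSR rounding mode (or the
+ * explicit R in the *_round_* forms), while the _mm_cvttsh_* variants always
+ * truncate toward zero, matching the vcvtsh2si / vcvttsh2si distinction. */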
+
+#define _mm512_cvtx_roundph_ps(A, R) \
+ ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
+ (__v16sf)_mm512_undefined_ps(), \
+ (__mmask16)(-1), (int)(R)))
+
+#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
+ ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
+ ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
+ (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
+ return (__m512)__builtin_ia32_vcvtph2psx512_mask(
+ (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
+ return (__m512)__builtin_ia32_vcvtph2psx512_mask(
+ (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512 __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
+ return (__m512)__builtin_ia32_vcvtph2psx512_mask(
+ (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtx_roundps_ph(A, R) \
+ ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
+ (__v16hf)_mm256_undefined_ph(), \
+ (__mmask16)(-1), (int)(R)))
+
+#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
+ ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
+ ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
+ (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
+ return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
+ (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
+ return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
+ (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
+ return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
+ (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmadd_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_fmsub_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_fnmadd_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
+ -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
+ -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_fnmsub_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
+ -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
+ __m512h __B,
+ __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
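+
+/* Example (illustrative sketch; a, b, c are __m512h values): d = a*b + c over
+ * 32 fp16 lanes; in the masked form, lanes whose mask bit is 0 keep the value
+ * from the first operand.
+ *
+ *   __m512h d  = _mm512_fmadd_ph(a, b, c);
+ *   __m512h dm = _mm512_mask_fmadd_ph(a, (__mmask32)0x0000FFFF, b, c);
+ */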
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
+ __m512h __B,
+ __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
+ -(__v32hf)__C, (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
+ -(__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_maskz(
+ (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
+ __m512h __B,
+ __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
+ (__v32hf)__C, (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
+ __m512h __B,
+ __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
+ -(__v32hf)__C, (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_maskz(
+ -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmaddsub_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_fmsubadd_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)-1, (int)(R)))
+
+#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
+ (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
+ (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
+ (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
+ (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
+ (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask( \
+ (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
+ -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
+ (__mmask32)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
+ -(__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
+ return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
+ (__v32hf)__C, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
+ __m128h __A,
+ __m128h __B) {
+ return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
+ (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fmadd_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
+ (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
+ -(__v8hf)__B, (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
+ -(__v8hf)__B, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fmsub_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
+ -(__v8hf)__C, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
+ (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
+ ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
+ (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
+ __m128h __A,
+ __m128h __B) {
+ return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
+ (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fnmadd_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
+ (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
+ (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
+ __m128h __A,
+ __m128h __B) {
+ return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
+ (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
+ (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fnmsub_round_sh(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
+ (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
+ (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
+ return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
+ (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
+ ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
+ (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C, (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
+ (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fcmadd_round_sch(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
+ ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
+ ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C, (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
+ (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fmadd_round_sch(A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
+ ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
+ ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
+ (__mmask8)(U), (int)(R)))
+
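+/* Illustrative sketch (assumes -mavx512fp16; variable names are arbitrary):
+ * the *_sch intrinsics treat lanes 0/1 of an __m128h as the real/imaginary
+ * parts of one complex FP16 value, so _mm_fmadd_sch(a, b, c) accumulates the
+ * complex product a*b into c in those two lanes.
+ *
+ *   #include <immintrin.h>
+ *
+ *   static __m128h cfma_example(void) {
+ *     __m128h a = _mm_setr_ph(1.0f16, 2.0f16, 0.0f16, 0.0f16,
+ *                             0.0f16, 0.0f16, 0.0f16, 0.0f16); // 1 + 2i
+ *     __m128h b = _mm_setr_ph(3.0f16, 4.0f16, 0.0f16, 0.0f16,
+ *                             0.0f16, 0.0f16, 0.0f16, 0.0f16); // 3 + 4i
+ *     __m128h c = _mm_setr_ph(0.5f16, 0.5f16, 0.0f16, 0.0f16,
+ *                             0.0f16, 0.0f16, 0.0f16, 0.0f16);
+ *     // Lanes 0/1 of the result: (1+2i)*(3+4i) + (0.5+0.5i) = -4.5 + 10.5i.
+ *     return _mm_fmadd_sch(a, b, c);
+ *   }
+ */
+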
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfcmulcsh_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_vfcmulcsh_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fcmul_round_sch(A, B, R) \
+ ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmulcsh_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__W, (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmulcsh_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm_fmul_round_sch(A, B, R) \
+ ((__m128h)__builtin_ia32_vfmulcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfmulcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_fmul_round_sch(U, A, B, R) \
+ ((__m128h)__builtin_ia32_vfmulcsh_mask( \
+ (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
+ (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
+
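+/* Illustrative sketch (assumes -mavx512fp16; the helper name is arbitrary):
+ * _mm_fmul_sch multiplies the complex FP16 values held in lanes 0/1 of its
+ * operands; _mm_fcmul_sch is the variant that conjugates one operand first
+ * (VFCMULCSH).
+ *
+ *   #include <immintrin.h>
+ *
+ *   static __m128h cmul_example(void) {
+ *     __m128h a = _mm_setr_ph(1.0f16, 1.0f16, 0.0f16, 0.0f16,
+ *                             0.0f16, 0.0f16, 0.0f16, 0.0f16); // 1 + 1i
+ *     __m128h b = _mm_setr_ph(2.0f16, 3.0f16, 0.0f16, 0.0f16,
+ *                             0.0f16, 0.0f16, 0.0f16, 0.0f16); // 2 + 3i
+ *     return _mm_fmul_sch(a, b); // lanes 0/1: (1+1i)*(2+3i) = -1 + 5i
+ *   }
+ */
+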
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
+ __m512h __B) {
+ return (__m512h)__builtin_ia32_vfcmulcph512_mask(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
+ (__v16sf)__W, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_vfcmulcph512_mask(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fcmul_round_pch(A, B, R) \
+ ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
+ ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
+ __m512h __B) {
+ return (__m512h)__builtin_ia32_vfmulcph512_mask(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
+ (__v16sf)__W, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_vfmulcph512_mask(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmul_round_pch(A, B, R) \
+ ((__m512h)__builtin_ia32_vfmulcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_vfmulcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
+ ((__m512h)__builtin_ia32_vfmulcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
+ (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
+ __m512h __B,
+ __m512h __C) {
+ return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
+ return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fcmadd_round_pch(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
+ __m512h __B,
+ __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
+ (__v16sf)__C, (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
+ (__v16sf)__C, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
+ return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
+ return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
+ (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_fmadd_round_pch(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
+ (__mmask16)-1, (int)(R)))
+
+#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
+ (__mmask16)(U), (int)(R)))
+
+#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
+ (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
+ (__mmask16)(U), (int)(R)))
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS512
+_mm512_reduce_add_ph(__m512h __W) {
+ return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
+}
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS512
+_mm512_reduce_mul_ph(__m512h __W) {
+ return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
+}
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS512
+_mm512_reduce_max_ph(__m512h __V) {
+ return __builtin_ia32_reduce_fmax_ph512(__V);
+}
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS512
+_mm512_reduce_min_ph(__m512h __V) {
+ return __builtin_ia32_reduce_fmin_ph512(__V);
+}
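+
+/* Illustrative sketch (assumes -mavx512fp16): the reduce_* helpers fold all
+ * 32 half-precision lanes of an __m512h into a single scalar.
+ *
+ *   #include <immintrin.h>
+ *
+ *   static _Float16 sum_all_lanes(void) {
+ *     __m512h v = _mm512_set1_ph(0.5f16); // 32 lanes of 0.5
+ *     return _mm512_reduce_add_ph(v);     // 32 * 0.5 = 16.0
+ *   }
+ */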
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
+ return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
+ (__v32hf)__A);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
+ return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
+ (__v32hi)__B);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
+ return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
+}
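+
+/* Illustrative sketch (assumes -mavx512fp16; the helper name is arbitrary):
+ * _mm512_permutexvar_ph fills each destination lane with the source lane
+ * named by the matching 16-bit index, here reversing the 32 lanes.
+ *
+ *   #include <immintrin.h>
+ *
+ *   static __m512h reverse_lanes(__m512h v) {
+ *     short idx[32];
+ *     for (int i = 0; i < 32; ++i)
+ *       idx[i] = 31 - i;
+ *     return _mm512_permutexvar_ph(_mm512_loadu_si512(idx), v);
+ *   }
+ */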
+
+// The intrinsics below are aliases for the f*mul_*ch intrinsics.
+#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
+#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
+#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
+#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
+#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
+ _mm512_mask_fmul_round_pch(W, U, A, B, R)
+#define _mm512_maskz_mul_round_pch(U, A, B, R) \
+ _mm512_maskz_fmul_round_pch(U, A, B, R)
+
+#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
+#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
+#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
+#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
+#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
+ _mm512_mask_fcmul_round_pch(W, U, A, B, R)
+#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
+ _mm512_maskz_fcmul_round_pch(U, A, B, R)
+
+#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
+#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
+#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
+#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
+#define _mm_mask_mul_round_sch(W, U, A, B, R) \
+ _mm_mask_fmul_round_sch(W, U, A, B, R)
+#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)
+
+#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
+#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
+#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
+#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
+#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
+ _mm_mask_fcmul_round_sch(W, U, A, B, R)
+#define _mm_maskz_cmul_round_sch(U, A, B, R) \
+ _mm_maskz_fcmul_round_sch(U, A, B, R)
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+#undef __DEFAULT_FN_ATTRS512
+
+#endif
--- /dev/null
+/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __IFMAINTRIN_H
+#define __IFMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), __min_vector_width__(512)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+ return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
+ (__v8di) __Z);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__M,
+ (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__M,
+ (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+ return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
+ (__v8di) __Z);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__M,
+ (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
+ (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__M,
+ (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
+ (__v8di)_mm512_setzero_si512());
+}
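+
+/* Illustrative sketch (assumes -mavx512ifma): per 64-bit lane, madd52lo adds
+ * the low 52 bits of the product low52(__Y) * low52(__Z) to __X, while
+ * madd52hi adds the high 52 bits of that 104-bit product, the building block
+ * of big-integer and modular-arithmetic kernels.
+ *
+ *   #include <immintrin.h>
+ *
+ *   static __m512i madd52_example(void) {
+ *     __m512i acc = _mm512_set1_epi64(1);
+ *     __m512i y   = _mm512_set1_epi64(3);
+ *     __m512i z   = _mm512_set1_epi64(5);
+ *     // Each lane: 1 + ((3 * 5) & ((1ULL << 52) - 1)) = 16.
+ *     return _mm512_madd52lo_epu64(acc, y, z);
+ *   }
+ */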
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __IFMAVLINTRIN_H
+#define __IFMAVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))
+
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+ return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y,
+ (__v2di) __Z);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__M,
+ (__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__M,
+ (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+ return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
+ (__v4di)__Z);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__M,
+ (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__M,
+ (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+ return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
+ (__v2di)__Z);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__M,
+ (__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__M,
+ (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+ return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
+ (__v4di)__Z);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__M,
+ (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__M,
+ (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
+ (__v4di)_mm256_setzero_si256());
+}
+
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
--- /dev/null
+/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512PFINTRIN_H
+#define __AVX512PFINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
+
+#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
+ __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
+ (void const *)(addr), (int)(scale), \
+ (int)(hint))
+
+#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
+ __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
+ (void const *)(addr), (int)(scale), \
+ (int)(hint))
+
+#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
+ __builtin_ia32_gatherpfdps((__mmask16)(mask), \
+ (__v16si)(__m512i)(index), (void const *)(addr), \
+ (int)(scale), (int)(hint))
+
+#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
+ __builtin_ia32_gatherpfdps((__mmask16) -1, \
+ (__v16si)(__m512i)(index), (void const *)(addr), \
+ (int)(scale), (int)(hint))
+
+#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
+ __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
+ (void const *)(addr), (int)(scale), \
+ (int)(hint))
+
+#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
+ __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
+ (void const *)(addr), (int)(scale), \
+ (int)(hint))
+
+#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
+ __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
+ (void const *)(addr), (int)(scale), (int)(hint))
+
+#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
+ __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
+ (void const *)(addr), (int)(scale), (int)(hint))
+
+#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
+ __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
+ (void *)(addr), (int)(scale), \
+ (int)(hint))
+
+#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
+ __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
+ (void *)(addr), (int)(scale), \
+ (int)(hint))
+
+#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
+ __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
+ (void *)(addr), (int)(scale), (int)(hint))
+
+#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
+ __builtin_ia32_scatterpfdps((__mmask16)(mask), \
+ (__v16si)(__m512i)(index), (void *)(addr), \
+ (int)(scale), (int)(hint))
+
+#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
+ __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
+ (void *)(addr), (int)(scale), \
+ (int)(hint))
+
+#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
+ __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
+ (void *)(addr), (int)(scale), \
+ (int)(hint))
+
+#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
+ __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
+ (void *)(addr), (int)(scale), (int)(hint))
+
+#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
+ __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
+ (void *)(addr), (int)(scale), (int)(hint))
+
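+/* Illustrative usage sketch (assumes -mavx512pf, i.e. Xeon Phi class
+ * hardware; the helper name is arbitrary): warm the caches for a later
+ * gather of eight doubles addressed through 32-bit indices with an 8-byte
+ * scale and the T0 hint.
+ *
+ *   #include <immintrin.h>
+ *
+ *   static void warm_gather(const double *base, __m256i idx32) {
+ *     _mm512_prefetch_i32gather_pd(idx32, base, 8, _MM_HINT_T0);
+ *   }
+ */
+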
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VBMI2INTRIN_H
+#define __AVX512VBMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"), __min_vector_width__(512)))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
+{
+ return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
+ (__v32hi) __S,
+ __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
+{
+ return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
+ (__v32hi) _mm512_setzero_si512(),
+ __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
+{
+ return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
+ (__v64qi) __S,
+ __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
+{
+ return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
+ (__v64qi) _mm512_setzero_si512(),
+ __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
+{
+ __builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
+ __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
+{
+ __builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
+ __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
+{
+ return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
+ (__v32hi) __S,
+ __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
+{
+ return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
+ (__v32hi) _mm512_setzero_si512(),
+ __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
+{
+ return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
+ (__v64qi) __S,
+ __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
+{
+ return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
+ (__v64qi) _mm512_setzero_si512(),
+ __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
+ (__v32hi) __S,
+ __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
+ (__v32hi) _mm512_setzero_si512(),
+ __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
+ (__v64qi) __S,
+ __U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
+ (__v64qi) _mm512_setzero_si512(),
+ __U);
+}
+
+#define _mm512_shldi_epi64(A, B, I) \
+ ((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), (int)(I)))
+
+#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_shldi_epi64((A), (B), (I)), \
+ (__v8di)(__m512i)(S)))
+
+#define _mm512_maskz_shldi_epi64(U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_shldi_epi64((A), (B), (I)), \
+ (__v8di)_mm512_setzero_si512()))
+
+#define _mm512_shldi_epi32(A, B, I) \
+ ((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B), (int)(I)))
+
+#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_shldi_epi32((A), (B), (I)), \
+ (__v16si)(__m512i)(S)))
+
+#define _mm512_maskz_shldi_epi32(U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_shldi_epi32((A), (B), (I)), \
+ (__v16si)_mm512_setzero_si512()))
+
+#define _mm512_shldi_epi16(A, B, I) \
+ ((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
+ (__v32hi)(__m512i)(B), (int)(I)))
+
+#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+ (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
+ (__v32hi)(__m512i)(S)))
+
+#define _mm512_maskz_shldi_epi16(U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+ (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
+ (__v32hi)_mm512_setzero_si512()))
+
+#define _mm512_shrdi_epi64(A, B, I) \
+ ((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), (int)(I)))
+
+#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
+ (__v8di)(__m512i)(S)))
+
+#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
+ (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
+ (__v8di)_mm512_setzero_si512()))
+
+#define _mm512_shrdi_epi32(A, B, I) \
+ ((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B), (int)(I)))
+
+#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
+ (__v16si)(__m512i)(S)))
+
+#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
+ (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
+ (__v16si)_mm512_setzero_si512()))
+
+#define _mm512_shrdi_epi16(A, B, I) \
+ ((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
+ (__v32hi)(__m512i)(B), (int)(I)))
+
+#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+ (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
+ (__v32hi)(__m512i)(S)))
+
+#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
+ (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
+ (__v32hi)_mm512_setzero_si512()))
+
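+/* Illustrative sketch (assumes -mavx512vbmi2): the shldi/shrdi families are
+ * per-lane funnel shifts; _mm512_shldi_epi64(A, B, I) returns the upper 64
+ * bits of the 128-bit concatenation (A:B) shifted left by I, i.e.
+ * (A << I) | (B >> (64 - I)) for 0 < I < 64.
+ *
+ *   #include <immintrin.h>
+ *
+ *   static __m512i rotl13_epi64(__m512i v) {
+ *     // Funnel-shifting a lane with itself is a 64-bit rotate left.
+ *     return _mm512_shldi_epi64(v, v, 13);
+ *   }
+ */
+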
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B,
+ (__v8di)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__U,
+ (__v8di)_mm512_shldv_epi64(__A, __B, __C),
+ (__v8di)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__U,
+ (__v8di)_mm512_shldv_epi64(__A, __B, __C),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B,
+ (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_shldv_epi32(__A, __B, __C),
+ (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_shldv_epi32(__A, __B, __C),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B,
+ (__v32hi)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_selectw_512(__U,
+ (__v32hi)_mm512_shldv_epi16(__A, __B, __C),
+ (__v32hi)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_selectw_512(__U,
+ (__v32hi)_mm512_shldv_epi16(__A, __B, __C),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B,
+ (__v8di)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__U,
+ (__v8di)_mm512_shrdv_epi64(__A, __B, __C),
+ (__v8di)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_selectq_512(__U,
+ (__v8di)_mm512_shrdv_epi64(__A, __B, __C),
+ (__v8di)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B,
+ (__v16si)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_shrdv_epi32(__A, __B, __C),
+ (__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_shrdv_epi32(__A, __B, __C),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B,
+ (__v32hi)__C);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_selectw_512(__U,
+ (__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
+ (__v32hi)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i)__builtin_ia32_selectw_512(__U,
+ (__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
+ (__v32hi)_mm512_setzero_si512());
+}
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
+
--- /dev/null
+/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VBMIINTRIN_H
+#define __VBMIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), __min_vector_width__(512)))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
+ (__v64qi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512(__U,
+ (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
+ (__v64qi)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512(__U,
+ (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
+ (__v64qi)__I);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512(__U,
+ (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_permutexvar_epi8(__A, __B),
+ (__v64qi)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_permutexvar_epi8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_vpmultishiftqb512((__v64qi)__X, (__v64qi) __Y);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X,
+ __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
+ (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
+ (__v64qi)_mm512_setzero_si512());
+}
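+
+/* Illustrative sketch (assumes -mavx512vbmi; the helper name is arbitrary):
+ * _mm512_permutexvar_epi8 is a full-width byte shuffle in which each
+ * destination byte is taken from the source byte selected by the low 6 bits
+ * of the matching index byte, which makes it a 64-entry table lookup.
+ *
+ *   #include <immintrin.h>
+ *
+ *   static __m512i lookup64(__m512i indices, __m512i table) {
+ *     return _mm512_permutexvar_epi8(indices, table);
+ *   }
+ */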
+
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VBMIVLINTRIN_H
+#define __VBMIVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(256)))
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
+ (__v16qi)__I,
+ (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
+ __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128(__U,
+ (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
+ (__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
+ __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128(__U,
+ (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
+ (__v16qi)__I);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
+ __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128(__U,
+ (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
+ (__v32qi)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
+ __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256(__U,
+ (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
+ (__v32qi)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
+ __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256(__U,
+ (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
+ (__v32qi)__I);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
+ __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256(__U,
+ (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_permutexvar_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_permutexvar_epi8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_permutexvar_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_permutexvar_epi8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_vpmultishiftqb128((__v16qi)__X, (__v16qi)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
+ (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_vpmultishiftqb256((__v32qi)__X, (__v32qi)__Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
+ (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
--- /dev/null
+/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLBF16INTRIN_H
+#define __AVX512VLBF16INTRIN_H
+
+#if (__clang_major__ <= 15)
+typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
+#endif
+
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512vl, avx512bf16"), __min_vector_width__(256)))
+
+/// Convert Two Packed Single Data to One Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 128-bit vector of [4 x float].
+/// \param __B
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
+/// conversion of __B, and higher 64 bits come from conversion of __A.
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
+ (__v4sf) __B);
+}
+
+/// Convert Two Packed Single Data to One Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 128-bit vector of [4 x float].
+/// \param __B
+/// A 128-bit vector of [4 x float].
+/// \param __W
+/// A 128-bit vector of [8 x bfloat].
+/// \param __U
+/// An 8-bit mask value specifying what is chosen for each element.
+/// A 1 means conversion of __A or __B. A 0 means element from __W.
+/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
+/// conversion of __B, and higher 64 bits come from conversion of __A.
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
+ (__v8hi)__W);
+}
+
+/// Convert Two Packed Single Data to One Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 128-bit vector of [4 x float].
+/// \param __B
+/// A 128-bit vector of [4 x float].
+/// \param __U
+/// An 8-bit mask value specifying what is chosen for each element.
+/// A 1 means conversion of __A or __B. A 0 means element is zero.
+/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
+/// conversion of __B, and higher 64 bits come from conversion of __A.
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+/// Convert Two Packed Single Data to One Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \param __B
+/// A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
+/// conversion of __B, and higher 128 bits come from conversion of __A.
+static __inline__ __m256bh __DEFAULT_FN_ATTRS256
+_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
+ (__v8sf) __B);
+}
+
+/// Convert Two Packed Single Data to One Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \param __B
+/// A 256-bit vector of [8 x float].
+/// \param __W
+/// A 256-bit vector of [16 x bfloat].
+/// \param __U
+/// A 16-bit mask value specifying what is chosen for each element.
+/// A 1 means conversion of __A or __B. A 0 means element from __W.
+/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
+/// conversion of __B, and higher 128 bits come from conversion of __A.
+static __inline__ __m256bh __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
+ return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
+ (__v16hi)__W);
+}
+
+/// Convert Two Packed Single Data to One Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \param __B
+/// A 256-bit vector of [8 x float].
+/// \param __U
+/// A 16-bit mask value specifying what is chosen for each element.
+/// A 1 means conversion of __A or __B. A 0 means element is zero.
+/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
+/// conversion of __B, and higher 128 bits come from conversion of __A.
+static __inline__ __m256bh __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
+ return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+/// Convert Packed Single Data to Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
+/// conversion of __A, and higher 64 bits are 0.
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+_mm_cvtneps_pbh(__m128 __A) {
+ return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
+ (__v8hi)_mm_undefined_si128(),
+ (__mmask8)-1);
+}
+
+/// Convert Packed Single Data to Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 128-bit vector of [4 x float].
+/// \param __W
+/// A 128-bit vector of [8 x bfloat].
+/// \param __U
+/// A 4-bit mask value specifying what is chosen for each element.
+/// A 1 means conversion of __A. A 0 means element from __W.
+/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
+/// conversion of __A, and higher 64 bits are 0.
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
+ return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
+ (__v8hi)__W,
+ (__mmask8)__U);
+}
+
+/// Convert Packed Single Data to Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 128-bit vector of [4 x float].
+/// \param __U
+/// A 4-bit mask value specifying what is chosen for each element.
+/// A 1 means conversion of __A. A 0 means element is zero.
+/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
+/// conversion of __A, and higher 64 bits are 0.
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
+ return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
+ (__v8hi)_mm_setzero_si128(),
+ (__mmask8)__U);
+}
+
+/// Convert Packed Single Data to Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \returns A 128-bit vector of [8 x bfloat] that comes from conversion
+/// of __A.
+static __inline__ __m128bh __DEFAULT_FN_ATTRS256
+_mm256_cvtneps_pbh(__m256 __A) {
+ return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
+ (__v8hi)_mm_undefined_si128(),
+ (__mmask8)-1);
+}
+
+/// Convert Packed Single Data to Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \param __W
+/// A 256-bit vector of [8 x bfloat].
+/// \param __U
+/// An 8-bit mask value specifying what is chosen for each element.
+/// A 1 means conversion of __A. A 0 means element from __W.
+/// \returns A 128-bit vector of [8 x bfloat] that comes from conversion
+/// of __A.
+static __inline__ __m128bh __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
+ return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
+ (__v8hi)__W,
+ (__mmask8)__U);
+}
+
+/// Convert Packed Single Data to Packed BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
+///
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \param __U
+/// An 8-bit mask value specifying what is chosen for each element.
+/// A 1 means conversion of __A. A 0 means element is zero.
+/// \returns A 128-bit vector of [8 x bfloat] that comes from conversion
+/// of __A.
+static __inline__ __m128bh __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
+ return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
+ (__v8hi)_mm_setzero_si128(),
+ (__mmask8)__U);
+}
+
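+/* Example: a minimal sketch, assuming -mavx512bf16 -mavx512vl. Eight floats
+ * narrow to eight BF16 values in a single 128-bit vector:
+ *
+ *   __m256 src = _mm256_set1_ps(0.5f);
+ *   __m128bh dst = _mm256_cvtneps_pbh(src);  // 8 x bf16(0.5f)
+ */
+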
+/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
+///
+/// \param __A
+/// A 128-bit vector of [8 x bfloat].
+/// \param __B
+/// A 128-bit vector of [8 x bfloat].
+/// \param __D
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] that comes from the dot product
+/// of __A and __B, accumulated with __D.
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
+ return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
+ (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
+///
+/// \param __A
+/// A 128-bit vector of [8 x bfloat].
+/// \param __B
+/// A 128-bit vector of [8 x bfloat].
+/// \param __D
+/// A 128-bit vector of [4 x float].
+/// \param __U
+/// An 8-bit mask value specifying what is chosen for each element.
+/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
+/// \returns A 128-bit vector of [4 x float] that comes from the dot product
+/// of __A and __B, accumulated with __D.
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
+ (__v4sf)__D);
+}
+
+/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
+///
+/// \param __A
+/// A 128-bit vector of [8 x bfloat].
+/// \param __B
+/// A 128-bit vector of [8 x bfloat].
+/// \param __D
+/// A 128-bit vector of [4 x float].
+/// \param __U
+/// An 8-bit mask value specifying what is chosen for each element.
+/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
+/// \returns A 128-bit vector of [4 x float] that comes from the dot product
+/// of __A and __B, accumulated with __D.
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
+ (__v4sf)_mm_setzero_si128());
+}
+
+/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
+///
+/// \param __A
+/// A 256-bit vector of [16 x bfloat].
+/// \param __B
+/// A 256-bit vector of [16 x bfloat].
+/// \param __D
+/// A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] that comes from the dot product
+/// of __A and __B, accumulated with __D.
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
+ return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
+ (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
+///
+/// \param __A
+/// A 256-bit vector of [16 x bfloat].
+/// \param __B
+/// A 256-bit vector of [16 x bfloat].
+/// \param __D
+/// A 256-bit vector of [8 x float].
+/// \param __U
+/// An 8-bit mask value specifying what is chosen for each element.
+/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
+/// \returns A 256-bit vector of [8 x float] that comes from the dot product
+/// of __A and __B, accumulated with __D.
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
+ (__v8sf)__D);
+}
+
+/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
+///
+/// \param __A
+/// A 256-bit vector of [16 x bfloat].
+/// \param __B
+/// A 256-bit vector of [16 x bfloat].
+/// \param __D
+/// A 256-bit vector of [8 x float].
+/// \param __U
+/// An 8-bit mask value specifying what is chosen for each element.
+/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
+/// \returns A 256-bit vector of [8 x float] that comes from the dot product
+/// of __A and __B, accumulated with __D.
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
+ (__v8sf)_mm256_setzero_si256());
+}
+
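+/* Example: a minimal sketch, assuming -mavx512bf16 -mavx512vl. Each 32-bit
+ * lane i of the result is __D[i] + A[2i]*B[2i] + A[2i+1]*B[2i+1], with the
+ * BF16 products evaluated in single precision:
+ *
+ *   __m128 acc = _mm_setzero_ps();
+ *   __m128bh a = _mm_cvtne2ps_pbh(_mm_set1_ps(2.0f), _mm_set1_ps(2.0f));
+ *   __m128bh b = _mm_cvtne2ps_pbh(_mm_set1_ps(3.0f), _mm_set1_ps(3.0f));
+ *   acc = _mm_dpbf16_ps(acc, a, b);  // each lane: 2*3 + 2*3 = 12.0f
+ */
+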
+/// Convert One Single float Data to One BF16 Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
+///
+/// \param __A
+/// A float value.
+/// \returns A bf16 value whose sign and exponent fields are unchanged and
+/// whose fraction field is truncated to 7 bits.
+static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
+ __v4sf __V = {__A, 0, 0, 0};
+#if (__clang_major__ > 15)
+ __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
+ (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
+ return (__bf16)__R[0];
+#else
+ __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
+ (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+ return __R[0];
+#endif
+}
+
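+/* Example: a minimal sketch, assuming -mavx512bf16 -mavx512vl. A scalar
+ * float narrows to a BF16 value that keeps only 7 fraction bits, so nearby
+ * floats can map to the same BF16 encoding:
+ *
+ *   __bfloat16 h = _mm_cvtness_sbh(1.0f);
+ *   __bfloat16 g = _mm_cvtness_sbh(1.0f + 1.0f / 1024.0f);  // may equal h
+ */
+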
+/// Convert Packed BF16 Data to Packed float Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __A
+/// A 128-bit vector of [4 x bfloat].
+/// \returns A 128-bit vector of [4 x float] that comes from conversion of
+/// __A.
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
+ return _mm_castsi128_ps(
+ (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
+}
+
+/// Convert Packed BF16 Data to Packed float Data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __A
+/// A 128-bit vector of [8 x bfloat].
+/// \returns A 256-bit vector of [8 x float] that comes from conversion of
+/// __A.
+static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
+ return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
+ (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
+}
+
+/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __U
+/// A 4-bit mask. Elements are zeroed out when the corresponding mask
+/// bit is not set.
+/// \param __A
+/// A 128-bit vector of [4 x bfloat].
+/// \returns A 128-bit vector of [4 x float] that comes from conversion of
+/// __A.
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
+ return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
+ (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
+}
+
+/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __U
+/// An 8-bit mask. Elements are zeroed out when the corresponding mask
+/// bit is not set.
+/// \param __A
+/// A 128-bit vector of [8 x bfloat].
+/// \returns A 256-bit vector of [8 x float] that comes from conversion of
+/// __A.
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
+ return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
+ (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
+}
+
+/// Convert Packed BF16 Data to Packed float Data using merging mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __S
+/// A 128-bit vector of [4 x float]. Elements are copied from __S when
+/// the corresponding mask bit is not set.
+/// \param __U
+/// A 4-bit mask. Elements are copied from __S when the corresponding mask
+/// bit is not set.
+/// \param __A
+/// A 128-bit vector of [4 x bfloat].
+/// \returns A 128-bit vector of [4 x float] that comes from conversion of
+/// __A.
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
+ return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
+ (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
+ 16));
+}
+
+/// Convert Packed BF16 Data to Packed float Data using merging mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \param __S
+/// A 256-bit vector of [8 x float]. Elements are copied from __S when
+/// the corresponding mask bit is not set.
+/// \param __U
+/// An 8-bit mask. Elements are copied from __S when the corresponding mask
+/// bit is not set.
+/// \param __A
+/// A 128-bit vector of [8 x bfloat].
+/// \returns A 256-bit vector of [8 x float] that comes from conversion of
+/// __A.
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
+ return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
+ (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
+ 16));
+}
+
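+/* Example: a minimal sketch, assuming -mavx512bf16 -mavx512vl. BF16 is the
+ * upper 16 bits of an IEEE float, so widening back to float is just a widen
+ * of each 16-bit lane to 32 bits followed by a shift left by 16, which is
+ * what the helpers above do:
+ *
+ *   __m128bh p = _mm_cvtne2ps_pbh(_mm_set1_ps(2.0f), _mm_set1_ps(1.0f));
+ *   __m128 low4 = _mm_cvtpbh_ps(p);   // {1.0f, 1.0f, 1.0f, 1.0f}
+ */
+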
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
--- /dev/null
+/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLBITALGINTRIN_H
+#define __AVX512VLBITALGINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(256)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_popcnt_epi16(__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
+ (__v16hi) _mm256_popcnt_epi16(__B),
+ (__v16hi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
+{
+ return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
+ __U,
+ __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_popcnt_epi16(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
+ (__v8hi) _mm_popcnt_epi16(__B),
+ (__v8hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
+{
+ return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
+ __U,
+ __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_popcnt_epi8(__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
+ (__v32qi) _mm256_popcnt_epi8(__B),
+ (__v32qi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
+{
+ return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
+ __U,
+ __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_popcnt_epi8(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
+ (__v16qi) _mm_popcnt_epi8(__B),
+ (__v16qi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
+{
+ return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
+ __U,
+ __B);
+}
+
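+/* Example: a minimal sketch, assuming -mavx512bitalg -mavx512vl. Each byte
+ * (or 16-bit word) lane is replaced by its population count:
+ *
+ *   __m128i v = _mm_set1_epi8(0x0F);
+ *   __m128i c = _mm_popcnt_epi8(v);               // every byte == 4
+ *   __m128i z = _mm_maskz_popcnt_epi8(0x00FF, v); // low 8 bytes == 4, rest 0
+ */
+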
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
+_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
+ (__v32qi) __B,
+ __U);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
+_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B)
+{
+ return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1,
+ __A,
+ __B);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
+_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
+ (__v16qi) __B,
+ __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
+_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
+{
+ return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1,
+ __A,
+ __B);
+}
+
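+/* Example: a minimal sketch, assuming -mavx512bitalg -mavx512vl. For each
+ * byte j of the second operand, the low 6 bits select one bit inside the
+ * 64-bit lane of the first operand that byte j belongs to, and that bit
+ * becomes bit j of the result mask:
+ *
+ *   __m128i bits = _mm_set1_epi64x(1);   // only bit 0 of each qword set
+ *   __m128i idx0 = _mm_set1_epi8(0);     // every byte selects bit 0
+ *   __mmask16 m = _mm_bitshuffle_epi64_mask(bits, idx0);  // m == 0xFFFF
+ */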
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
--- /dev/null
+/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLBWINTRIN_H
+#define __AVX512VLBWINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(256)))
+
+/* Integer compare */
+
+#define _mm_cmp_epi8_mask(a, b, p) \
+ ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+ (__v16qi)(__m128i)(b), (int)(p), \
+ (__mmask16)-1))
+
+#define _mm_mask_cmp_epi8_mask(m, a, b, p) \
+ ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
+ (__v16qi)(__m128i)(b), (int)(p), \
+ (__mmask16)(m)))
+
+#define _mm_cmp_epu8_mask(a, b, p) \
+ ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+ (__v16qi)(__m128i)(b), (int)(p), \
+ (__mmask16)-1))
+
+#define _mm_mask_cmp_epu8_mask(m, a, b, p) \
+ ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
+ (__v16qi)(__m128i)(b), (int)(p), \
+ (__mmask16)(m)))
+
+#define _mm256_cmp_epi8_mask(a, b, p) \
+ ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+ (__v32qi)(__m256i)(b), (int)(p), \
+ (__mmask32)-1))
+
+#define _mm256_mask_cmp_epi8_mask(m, a, b, p) \
+ ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
+ (__v32qi)(__m256i)(b), (int)(p), \
+ (__mmask32)(m)))
+
+#define _mm256_cmp_epu8_mask(a, b, p) \
+ ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+ (__v32qi)(__m256i)(b), (int)(p), \
+ (__mmask32)-1))
+
+#define _mm256_mask_cmp_epu8_mask(m, a, b, p) \
+ ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
+ (__v32qi)(__m256i)(b), (int)(p), \
+ (__mmask32)(m)))
+
+#define _mm_cmp_epi16_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+ (__v8hi)(__m128i)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm_mask_cmp_epi16_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
+ (__v8hi)(__m128i)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm_cmp_epu16_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+ (__v8hi)(__m128i)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm_mask_cmp_epu16_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
+ (__v8hi)(__m128i)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm256_cmp_epi16_mask(a, b, p) \
+ ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+ (__v16hi)(__m256i)(b), (int)(p), \
+ (__mmask16)-1))
+
+#define _mm256_mask_cmp_epi16_mask(m, a, b, p) \
+ ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
+ (__v16hi)(__m256i)(b), (int)(p), \
+ (__mmask16)(m)))
+
+#define _mm256_cmp_epu16_mask(a, b, p) \
+ ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+ (__v16hi)(__m256i)(b), (int)(p), \
+ (__mmask16)-1))
+
+#define _mm256_mask_cmp_epu16_mask(m, a, b, p) \
+ ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
+ (__v16hi)(__m256i)(b), (int)(p), \
+ (__mmask16)(m)))
+
+#define _mm_cmpeq_epi8_mask(A, B) \
+ _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epi8_mask(k, A, B) \
+ _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epi8_mask(A, B) \
+ _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epi8_mask(k, A, B) \
+ _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epi8_mask(A, B) \
+ _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epi8_mask(k, A, B) \
+ _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epi8_mask(A, B) \
+ _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epi8_mask(k, A, B) \
+ _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epi8_mask(A, B) \
+ _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epi8_mask(k, A, B) \
+ _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epi8_mask(A, B) \
+ _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epi8_mask(k, A, B) \
+ _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm256_cmpeq_epi8_mask(A, B) \
+ _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epi8_mask(k, A, B) \
+ _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epi8_mask(A, B) \
+ _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epi8_mask(k, A, B) \
+ _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epi8_mask(A, B) \
+ _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epi8_mask(k, A, B) \
+ _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epi8_mask(A, B) \
+ _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epi8_mask(k, A, B) \
+ _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epi8_mask(A, B) \
+ _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epi8_mask(k, A, B) \
+ _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epi8_mask(A, B) \
+ _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epi8_mask(k, A, B) \
+ _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm_cmpeq_epu8_mask(A, B) \
+ _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epu8_mask(k, A, B) \
+ _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epu8_mask(A, B) \
+ _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epu8_mask(k, A, B) \
+ _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epu8_mask(A, B) \
+ _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epu8_mask(k, A, B) \
+ _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epu8_mask(A, B) \
+ _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epu8_mask(k, A, B) \
+ _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epu8_mask(A, B) \
+ _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epu8_mask(k, A, B) \
+ _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epu8_mask(A, B) \
+ _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epu8_mask(k, A, B) \
+ _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm256_cmpeq_epu8_mask(A, B) \
+ _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epu8_mask(k, A, B) \
+ _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epu8_mask(A, B) \
+ _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epu8_mask(k, A, B) \
+ _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epu8_mask(A, B) \
+ _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epu8_mask(k, A, B) \
+ _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epu8_mask(A, B) \
+ _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epu8_mask(k, A, B) \
+ _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epu8_mask(A, B) \
+ _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epu8_mask(k, A, B) \
+ _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epu8_mask(A, B) \
+ _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epu8_mask(k, A, B) \
+ _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm_cmpeq_epi16_mask(A, B) \
+ _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epi16_mask(k, A, B) \
+ _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epi16_mask(A, B) \
+ _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epi16_mask(k, A, B) \
+ _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epi16_mask(A, B) \
+ _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epi16_mask(k, A, B) \
+ _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epi16_mask(A, B) \
+ _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epi16_mask(k, A, B) \
+ _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epi16_mask(A, B) \
+ _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epi16_mask(k, A, B) \
+ _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epi16_mask(A, B) \
+ _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epi16_mask(k, A, B) \
+ _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm256_cmpeq_epi16_mask(A, B) \
+ _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epi16_mask(k, A, B) \
+ _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epi16_mask(A, B) \
+ _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epi16_mask(k, A, B) \
+ _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epi16_mask(A, B) \
+ _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epi16_mask(k, A, B) \
+ _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epi16_mask(A, B) \
+ _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epi16_mask(k, A, B) \
+ _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epi16_mask(A, B) \
+ _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epi16_mask(k, A, B) \
+ _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epi16_mask(A, B) \
+ _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epi16_mask(k, A, B) \
+ _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm_cmpeq_epu16_mask(A, B) \
+ _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epu16_mask(k, A, B) \
+ _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epu16_mask(A, B) \
+ _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epu16_mask(k, A, B) \
+ _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epu16_mask(A, B) \
+ _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epu16_mask(k, A, B) \
+ _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epu16_mask(A, B) \
+ _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epu16_mask(k, A, B) \
+ _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epu16_mask(A, B) \
+ _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epu16_mask(k, A, B) \
+ _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epu16_mask(A, B) \
+ _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epu16_mask(k, A, B) \
+ _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm256_cmpeq_epu16_mask(A, B) \
+ _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epu16_mask(k, A, B) \
+ _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epu16_mask(A, B) \
+ _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epu16_mask(k, A, B) \
+ _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epu16_mask(A, B) \
+ _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epu16_mask(k, A, B) \
+ _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epu16_mask(A, B) \
+ _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epu16_mask(k, A, B) \
+ _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epu16_mask(A, B) \
+ _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epu16_mask(k, A, B) \
+ _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epu16_mask(A, B) \
+ _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epu16_mask(k, A, B) \
+ _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
+
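+/* Example: a minimal sketch, assuming -mavx512bw -mavx512vl. The compare
+ * macros return one mask bit per element, which can then drive any masked
+ * operation:
+ *
+ *   __m128i a = _mm_set1_epi16(5);
+ *   __m128i b = _mm_set1_epi16(7);
+ *   __mmask8 k = _mm_cmplt_epi16_mask(a, b);   // 0xFF: every lane 5 < 7
+ *   __m128i r = _mm_mask_blend_epi16(k, a, b); // picks b where k is set
+ */
+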
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_add_epi8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_add_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_add_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_add_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_sub_epi8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_sub_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sub_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sub_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_add_epi8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_add_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_add_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_add_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_sub_epi8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_sub_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sub_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sub_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
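+/* Example: a minimal sketch, assuming -mavx512bw -mavx512vl. The mask form
+ * keeps the old destination in unselected lanes, while the maskz form zeroes
+ * them:
+ *
+ *   __m128i a = _mm_set1_epi16(1), b = _mm_set1_epi16(2), w = _mm_set1_epi16(9);
+ *   __m128i m = _mm_mask_add_epi16(w, 0x0F, a, b);  // lanes 0-3: 3, lanes 4-7: 9
+ *   __m128i z = _mm_maskz_add_epi16(0x0F, a, b);    // lanes 0-3: 3, lanes 4-7: 0
+ */
+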
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mullo_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mullo_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mullo_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mullo_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
+{
+ return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
+ (__v16qi) __W,
+ (__v16qi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
+{
+ return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
+ (__v32qi) __W,
+ (__v32qi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+ return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
+ (__v8hi) __W,
+ (__v8hi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
+{
+ return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
+ (__v16hi) __W,
+ (__v16hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_abs_epi8(__A),
+ (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_abs_epi8(__A),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_abs_epi8(__A),
+ (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_abs_epi8(__A),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_abs_epi16(__A),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_abs_epi16(__A),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_abs_epi16(__A),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_abs_epi16(__A),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_packs_epi32(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_packs_epi32(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_packs_epi32(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_packs_epi32(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_packs_epi16(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_packs_epi16(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_packs_epi16(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_packs_epi16(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_packus_epi32(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_packus_epi32(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_packus_epi32(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_packus_epi32(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_packus_epi16(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_packus_epi16(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_packus_epi16(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_packus_epi16(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_adds_epi8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_adds_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_adds_epi8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_adds_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_adds_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_adds_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_adds_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_adds_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_adds_epu8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_adds_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_adds_epu8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_adds_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_adds_epu16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_adds_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_adds_epu16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_adds_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_avg_epu8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_avg_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_avg_epu8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_avg_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_avg_epu16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_avg_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_avg_epu16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_avg_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_max_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_max_epi8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_max_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_max_epi8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_max_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_max_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_max_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_max_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_max_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_max_epu8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_max_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_max_epu8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_max_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_max_epu16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_max_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_max_epu16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_min_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_min_epi8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_min_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_min_epi8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_min_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_min_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_min_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_min_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_min_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm_min_epu8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_min_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
+ (__v32qi)_mm256_min_epu8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_min_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_min_epu16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_min_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_min_epu16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_shuffle_epi8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_shuffle_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_shuffle_epi8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_shuffle_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_subs_epi8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_subs_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_subs_epi8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_subs_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_subs_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_subs_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_subs_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_subs_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_subs_epu8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_subs_epu8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_subs_epu8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_subs_epu8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_subs_epu16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_subs_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_subs_epu16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_subs_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
+ (__v8hi) __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I,
+ __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128(__U,
+ (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
+ (__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U,
+ __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128(__U,
+ (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
+ (__v8hi)__I);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
+ __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128(__U,
+ (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
+ (__v16hi)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I,
+ __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256(__U,
+ (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
+ (__v16hi)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U,
+ __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256(__U,
+ (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
+ (__v16hi)__I);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I,
+ __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256(__U,
+ (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
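+
+/* Usage note (illustrative, not part of the upstream header): the _mask_ and
+ * _mask2_ permutex2var variants differ only in which source survives where a
+ * mask bit is clear: _mm_mask_permutex2var_epi16 keeps elements of __A, while
+ * _mm_mask2_permutex2var_epi16 keeps elements of the index vector __I. A
+ * minimal sketch, assuming a caller built with AVX512VL and AVX512BW enabled:
+ *
+ *   __m128i a = _mm_set1_epi16(1), b = _mm_set1_epi16(2);
+ *   __m128i idx = _mm_setr_epi16(0, 8, 1, 9, 2, 10, 3, 11);
+ *   // Lanes whose mask bit is 0 fall back to 'a' (not 'idx') here:
+ *   __m128i r = _mm_mask_permutex2var_epi16(a, 0x0F, idx, b);
+ */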
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_maddubs_epi16(__X, __Y),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_maddubs_epi16(__X, __Y),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
+ __m256i __Y) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_maddubs_epi16(__X, __Y),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_maddubs_epi16(__X, __Y),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_madd_epi16(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_madd_epi16(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_madd_epi16(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_madd_epi16(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtsepi16_epi8 (__m128i __A) {
+ return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+ (__v16qi) _mm_setzero_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+ return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) {
+ return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+ (__v16qi) _mm_setzero_si128(),
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtsepi16_epi8 (__m256i __A) {
+ return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+ (__v16qi) _mm_setzero_si128(),
+ (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+ return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) {
+ return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+ (__v16qi) _mm_setzero_si128(),
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtusepi16_epi8 (__m128i __A) {
+ return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+ (__v16qi) _mm_setzero_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+ return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) {
+ return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+ (__v16qi) _mm_setzero_si128(),
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtusepi16_epi8 (__m256i __A) {
+ return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+ (__v16qi) _mm_setzero_si128(),
+ (__mmask16) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+ return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) {
+ return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+ (__v16qi) _mm_setzero_si128(),
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtepi16_epi8 (__m128i __A) {
+ return (__m128i)__builtin_shufflevector(
+ __builtin_convertvector((__v8hi)__A, __v8qi),
+ (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
+ return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) {
+ return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+ (__v16qi) _mm_setzero_si128(),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovwb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovuswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtepi16_epi8 (__m256i __A) {
+ return (__m128i)__builtin_convertvector((__v16hi) __A, __v16qi);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm256_cvtepi16_epi8(__A),
+ (__v16qi)__O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
+ (__v16qi)_mm256_cvtepi16_epi8(__A),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+ __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+ __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+ __builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhrs_epi16(__X, __Y),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhrs_epi16(__X, __Y),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhi_epu16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhi_epu16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhi_epu16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhi_epu16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhi_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_mulhi_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhi_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_mulhi_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_unpackhi_epi8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_unpackhi_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_unpackhi_epi8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_unpackhi_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_unpackhi_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_unpackhi_epi16(__A, __B),
+ (__v8hi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_unpackhi_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_unpackhi_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_unpacklo_epi8(__A, __B),
+ (__v16qi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
+ (__v16qi)_mm_unpacklo_epi8(__A, __B),
+ (__v16qi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_unpacklo_epi8(__A, __B),
+ (__v32qi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
+ (__v32qi)_mm256_unpacklo_epi8(__A, __B),
+ (__v32qi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_unpacklo_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_unpacklo_epi16(__A, __B),
+ (__v8hi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_unpacklo_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_unpacklo_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtepi8_epi16(__A),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtepi8_epi16(__A),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtepi8_epi16(__A),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtepi8_epi16(__A),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtepu8_epi16(__A),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_cvtepu8_epi16(__A),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtepu8_epi16(__A),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_cvtepu8_epi16(__A),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+#define _mm_mask_shufflehi_epi16(W, U, A, imm) \
+ ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+ (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
+ (__v8hi)(__m128i)(W)))
+
+#define _mm_maskz_shufflehi_epi16(U, A, imm) \
+ ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+ (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
+ (__v8hi)_mm_setzero_si128()))
+
+#define _mm256_mask_shufflehi_epi16(W, U, A, imm) \
+ ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+ (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
+ (__v16hi)(__m256i)(W)))
+
+#define _mm256_maskz_shufflehi_epi16(U, A, imm) \
+ ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+ (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
+ (__v16hi)_mm256_setzero_si256()))
+
+#define _mm_mask_shufflelo_epi16(W, U, A, imm) \
+ ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+ (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
+ (__v8hi)(__m128i)(W)))
+
+#define _mm_maskz_shufflelo_epi16(U, A, imm) \
+ ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+ (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
+ (__v8hi)_mm_setzero_si128()))
+
+#define _mm256_mask_shufflelo_epi16(W, U, A, imm) \
+ ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+ (__v16hi)_mm256_shufflelo_epi16((A), \
+ (imm)), \
+ (__v16hi)(__m256i)(W)))
+
+#define _mm256_maskz_shufflelo_epi16(U, A, imm) \
+ ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+ (__v16hi)_mm256_shufflelo_epi16((A), \
+ (imm)), \
+ (__v16hi)_mm256_setzero_si256()))
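+
+/* Usage note (illustrative only): because these shuffle helpers are macros
+ * wrapping an immediate-operand instruction, 'imm' must be a compile-time
+ * constant. A minimal sketch, assuming AVX512VL and AVX512BW are enabled:
+ *
+ *   __m128i v = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ *   // Reverse the four low words, zeroing lanes whose mask bit is clear.
+ *   __m128i r = _mm_maskz_shufflelo_epi16(0x0F, v, _MM_SHUFFLE(0, 1, 2, 3));
+ */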
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sllv_epi16(__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_psllv16hi((__v16hi)__A, (__v16hi)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sllv_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sllv_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_sllv_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psllv8hi((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sllv_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sllv_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sll_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sll_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sll_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sll_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_slli_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_slli_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A,
+ unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_slli_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_slli_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srlv_epi16(__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_psrlv16hi((__v16hi)__A, (__v16hi)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srlv_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srlv_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_srlv_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psrlv8hi((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srlv_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srlv_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srav_epi16(__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_psrav16hi((__v16hi)__A, (__v16hi)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srav_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srav_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_srav_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psrav8hi((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srav_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srav_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sra_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_sra_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sra_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_sra_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srai_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srai_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A,
+ unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srai_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srai_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srl_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srl_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srl_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srl_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srli_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
+ (__v8hi)_mm_srli_epi16(__A, __B),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srli_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
+ (__v16hi)_mm256_srli_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
+ (__v8hi) __A,
+ (__v8hi) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
+ (__v8hi) __A,
+ (__v8hi) _mm_setzero_si128 ());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
+ (__v16hi) __A,
+ (__v16hi) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
+ (__v16hi) __A,
+ (__v16hi) _mm256_setzero_si256 ());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
+ (__v16qi) __A,
+ (__v16qi) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
+ (__v16qi) __A,
+ (__v16qi) _mm_setzero_si128 ());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
+ (__v32qi) __A,
+ (__v32qi) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
+ (__v32qi) __A,
+ (__v32qi) _mm256_setzero_si256 ());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
+{
+ return (__m128i) __builtin_ia32_selectb_128(__M,
+ (__v16qi) _mm_set1_epi8(__A),
+ (__v16qi) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_set1_epi8 (__mmask16 __M, char __A)
+{
+ return (__m128i) __builtin_ia32_selectb_128(__M,
+ (__v16qi) _mm_set1_epi8(__A),
+ (__v16qi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
+{
+ return (__m256i) __builtin_ia32_selectb_256(__M,
+ (__v32qi) _mm256_set1_epi8(__A),
+ (__v32qi) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
+{
+ return (__m256i) __builtin_ia32_selectb_256(__M,
+ (__v32qi) _mm256_set1_epi8(__A),
+ (__v32qi) _mm256_setzero_si256());
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS128
+_mm_loadu_epi16 (void const *__P)
+{
+ struct __loadu_epi16 {
+ __m128i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_epi16*)__P)->__v;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddquhi128_mask ((const __v8hi *) __P,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddquhi128_mask ((const __v8hi *) __P,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS256
+_mm256_loadu_epi16 (void const *__P)
+{
+ struct __loadu_epi16 {
+ __m256i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_epi16*)__P)->__v;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddquhi256_mask ((const __v16hi *) __P,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddquhi256_mask ((const __v16hi *) __P,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS128
+_mm_loadu_epi8 (void const *__P)
+{
+ struct __loadu_epi8 {
+ __m128i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_epi8*)__P)->__v;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddquqi128_mask ((const __v16qi *) __P,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddquqi128_mask ((const __v16qi *) __P,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS256
+_mm256_loadu_epi8 (void const *__P)
+{
+ struct __loadu_epi8 {
+ __m256i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_epi8*)__P)->__v;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddquqi256_mask ((const __v32qi *) __P,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddquqi256_mask ((const __v32qi *) __P,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
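+
+/* Usage note (illustrative only): the masked loads above are a convenient way
+ * to read a partial vector without touching bytes past the end of a buffer;
+ * the masked stores below do the same for writes. A minimal sketch, assuming
+ * AVX512VL and AVX512BW are enabled:
+ *
+ *   // given: const char *src, char *dst, and a tail length 0 < n < 16
+ *   __mmask16 k = (__mmask16)((1U << n) - 1);    // select the low n bytes
+ *   __m128i v = _mm_maskz_loadu_epi8(k, src);    // masked-off lanes become 0
+ *   _mm_mask_storeu_epi8(dst, k, v);             // only n bytes are written
+ */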
+
+static __inline void __DEFAULT_FN_ATTRS128
+_mm_storeu_epi16 (void *__P, __m128i __A)
+{
+ struct __storeu_epi16 {
+ __m128i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_epi16*)__P)->__v = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A)
+{
+ __builtin_ia32_storedquhi128_mask ((__v8hi *) __P,
+ (__v8hi) __A,
+ (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS256
+_mm256_storeu_epi16 (void *__P, __m256i __A)
+{
+ struct __storeu_epi16 {
+ __m256i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_epi16*)__P)->__v = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A)
+{
+ __builtin_ia32_storedquhi256_mask ((__v16hi *) __P,
+ (__v16hi) __A,
+ (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS128
+_mm_storeu_epi8 (void *__P, __m128i __A)
+{
+ struct __storeu_epi8 {
+ __m128i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_epi8*)__P)->__v = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A)
+{
+ __builtin_ia32_storedquqi128_mask ((__v16qi *) __P,
+ (__v16qi) __A,
+ (__mmask16) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS256
+_mm256_storeu_epi8 (void *__P, __m256i __A)
+{
+ struct __storeu_epi8 {
+ __m256i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_epi8*)__P)->__v = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
+{
+ __builtin_ia32_storedquqi256_mask ((__v32qi *) __P,
+ (__v32qi) __A,
+ (__mmask32) __U);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
+_mm_test_epi8_mask (__m128i __A, __m128i __B)
+{
+ return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_si128());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
+_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return _mm_mask_cmpneq_epi8_mask (__U, _mm_and_si128 (__A, __B),
+ _mm_setzero_si128());
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
+_mm256_test_epi8_mask (__m256i __A, __m256i __B)
+{
+ return _mm256_cmpneq_epi8_mask (_mm256_and_si256(__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
+_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return _mm256_mask_cmpneq_epi8_mask (__U, _mm256_and_si256(__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_test_epi16_mask (__m128i __A, __m128i __B)
+{
+ return _mm_cmpneq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return _mm_mask_cmpneq_epi16_mask (__U, _mm_and_si128 (__A, __B),
+ _mm_setzero_si128());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
+_mm256_test_epi16_mask (__m256i __A, __m256i __B)
+{
+ return _mm256_cmpneq_epi16_mask (_mm256_and_si256 (__A, __B),
+ _mm256_setzero_si256 ());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
+_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return _mm256_mask_cmpneq_epi16_mask (__U, _mm256_and_si256(__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
+_mm_testn_epi8_mask (__m128i __A, __m128i __B)
+{
+ return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
+_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return _mm_mask_cmpeq_epi8_mask (__U, _mm_and_si128 (__A, __B),
+ _mm_setzero_si128());
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
+_mm256_testn_epi8_mask (__m256i __A, __m256i __B)
+{
+ return _mm256_cmpeq_epi8_mask (_mm256_and_si256 (__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
+_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return _mm256_mask_cmpeq_epi8_mask (__U, _mm256_and_si256 (__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_testn_epi16_mask (__m128i __A, __m128i __B)
+{
+ return _mm_cmpeq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return _mm_mask_cmpeq_epi16_mask (__U, _mm_and_si128(__A, __B), _mm_setzero_si128());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
+_mm256_testn_epi16_mask (__m256i __A, __m256i __B)
+{
+ return _mm256_cmpeq_epi16_mask (_mm256_and_si256(__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
+_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return _mm256_mask_cmpeq_epi16_mask (__U, _mm256_and_si256 (__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
+_mm_movepi8_mask (__m128i __A)
+{
+ return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A);
+}
+
+static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
+_mm256_movepi8_mask (__m256i __A)
+{
+ return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_movepi16_mask (__m128i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A);
+}
+
+static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
+_mm256_movepi16_mask (__m256i __A)
+{
+ return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_movm_epi8 (__mmask16 __A)
+{
+ return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_movm_epi8 (__mmask32 __A)
+{
+ return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_movm_epi16 (__mmask8 __A)
+{
+ return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_movm_epi16 (__mmask16 __A)
+{
+ return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectb_128(__M,
+ (__v16qi) _mm_broadcastb_epi8(__A),
+ (__v16qi) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectb_128(__M,
+ (__v16qi) _mm_broadcastb_epi8(__A),
+ (__v16qi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectb_256(__M,
+ (__v32qi) _mm256_broadcastb_epi8(__A),
+ (__v32qi) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectb_256(__M,
+ (__v32qi) _mm256_broadcastb_epi8(__A),
+ (__v32qi) _mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectw_128(__M,
+ (__v8hi) _mm_broadcastw_epi16(__A),
+ (__v8hi) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectw_128(__M,
+ (__v8hi) _mm_broadcastw_epi16(__A),
+ (__v8hi) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectw_256(__M,
+ (__v16hi) _mm256_broadcastw_epi16(__A),
+ (__v16hi) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectw_256(__M,
+ (__v16hi) _mm256_broadcastw_epi16(__A),
+ (__v16hi) _mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
+{
+ return (__m256i) __builtin_ia32_selectw_256 (__M,
+ (__v16hi) _mm256_set1_epi16(__A),
+ (__v16hi) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
+{
+ return (__m256i) __builtin_ia32_selectw_256(__M,
+ (__v16hi)_mm256_set1_epi16(__A),
+ (__v16hi) _mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
+{
+ return (__m128i) __builtin_ia32_selectw_128(__M,
+ (__v8hi) _mm_set1_epi16(__A),
+ (__v8hi) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_set1_epi16 (__mmask8 __M, short __A)
+{
+ return (__m128i) __builtin_ia32_selectw_128(__M,
+ (__v8hi) _mm_set1_epi16(__A),
+ (__v8hi) _mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_permutexvar_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_permvarhi128((__v8hi) __B, (__v8hi) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_permutexvar_epi16(__A, __B),
+ (__v8hi) _mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
+ (__v8hi)_mm_permutexvar_epi16(__A, __B),
+ (__v8hi)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_permutexvar_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_permvarhi256((__v16hi) __B, (__v16hi) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_permutexvar_epi16(__A, __B),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
+ (__v16hi)_mm256_permutexvar_epi16(__A, __B),
+ (__v16hi)__W);
+}
+
+#define _mm_mask_alignr_epi8(W, U, A, B, N) \
+ ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
+ (__v16qi)(__m128i)(W)))
+
+#define _mm_maskz_alignr_epi8(U, A, B, N) \
+ ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+ (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
+ (__v16qi)_mm_setzero_si128()))
+
+#define _mm256_mask_alignr_epi8(W, U, A, B, N) \
+ ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
+ (__v32qi)(__m256i)(W)))
+
+#define _mm256_maskz_alignr_epi8(U, A, B, N) \
+ ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+ (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
+ (__v32qi)_mm256_setzero_si256()))
+
+#define _mm_dbsad_epu8(A, B, imm) \
+ ((__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(imm)))
+
+#define _mm_mask_dbsad_epu8(W, U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+ (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
+ (__v8hi)(__m128i)(W)))
+
+#define _mm_maskz_dbsad_epu8(U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+ (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
+ (__v8hi)_mm_setzero_si128()))
+
+#define _mm256_dbsad_epu8(A, B, imm) \
+ ((__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \
+ (__v32qi)(__m256i)(B), (int)(imm)))
+
+#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+ (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
+ (__v16hi)(__m256i)(W)))
+
+#define _mm256_maskz_dbsad_epu8(U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+ (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
+ (__v16hi)_mm256_setzero_si256()))
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif /* __AVX512VLBWINTRIN_H */
--- /dev/null
+/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLCDINTRIN_H
+#define __AVX512VLCDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(256)))
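+
+/* Usage note (illustrative only): these wrappers require both AVX512VL and
+ * AVX512CD at compile time. A minimal sketch, assuming the translation unit
+ * is built with those features enabled and includes <immintrin.h>:
+ *
+ *   __m128i idx = _mm_setr_epi32(3, 3, 7, 7);
+ *   __m128i c  = _mm_conflict_epi32(idx);   // per-lane bitmask of earlier
+ *                                           // lanes holding the same value
+ *   __m128i lz = _mm_lzcnt_epi32(idx);      // per-lane leading-zero count
+ */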
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_broadcastmb_epi64 (__mmask8 __A)
+{
+ return (__m128i) _mm_set1_epi64x((long long) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_broadcastmb_epi64 (__mmask8 __A)
+{
+ return (__m256i) _mm256_set1_epi64x((long long)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_broadcastmw_epi32 (__mmask16 __A)
+{
+ return (__m128i) _mm_set1_epi32((int)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_broadcastmw_epi32 (__mmask16 __A)
+{
+ return (__m256i) _mm256_set1_epi32((int)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_conflict_epi64 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_conflict_epi64(__A),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_conflict_epi64(__A),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_conflict_epi64 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_conflict_epi64(__A),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_conflict_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_conflict_epi32 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_conflict_epi32(__A),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_conflict_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_conflict_epi32 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_conflict_epi32(__A),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_conflict_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_lzcnt_epi32 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_lzcnt_epi32(__A),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_lzcnt_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_lzcnt_epi32 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_lzcnt_epi32(__A),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_lzcnt_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_lzcnt_epi64 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_lzcnt_epi64(__A),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_lzcnt_epi64(__A),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_lzcnt_epi64 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_lzcnt_epi64(__A),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_lzcnt_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif /* __AVX512VLCDINTRIN_H */
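
For reference, a minimal usage sketch of the conflict-detection intrinsics imported above (illustrative only, not part of the imported header; assumes a toolchain invoked with -mavx512cd -mavx512vl, and uses _mm256_setr_epi32 / _mm256_cmpneq_epi32_mask from the wider immintrin.h set):

/* Illustrative only: flag the lanes whose 32-bit value repeats an earlier lane. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* Lane 2 repeats lane 0 (value 3) and lane 7 repeats lane 1 (value 1). */
    __m256i idx = _mm256_setr_epi32(3, 1, 3, 4, 5, 6, 7, 1);

    /* Each lane receives a bitmask of the earlier lanes holding the same value. */
    __m256i conflicts = _mm256_conflict_epi32(idx);

    /* A lane collides with an earlier one iff its conflict word is non-zero. */
    __mmask8 dup = _mm256_cmpneq_epi32_mask(conflicts, _mm256_setzero_si256());

    printf("duplicate-lane mask: 0x%02x\n", (unsigned)dup); /* prints 0x84 */
    return 0;
}

The same pattern with the 64-bit variant (_mm256_conflict_epi64) is the usual building block for detecting write collisions before a masked scatter.
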
--- /dev/null
+/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLDQINTRIN_H
+#define __AVX512VLDQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(256)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mullo_epi64 (__m256i __A, __m256i __B) {
+ return (__m256i) ((__v4du) __A * (__v4du) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_mullo_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_mullo_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mullo_epi64 (__m128i __A, __m128i __B) {
+ return (__m128i) ((__v2du) __A * (__v2du) __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_mullo_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_mullo_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_andnot_pd(__A, __B),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_andnot_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_andnot_pd(__A, __B),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_andnot_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_andnot_ps(__A, __B),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_andnot_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_andnot_ps(__A, __B),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_andnot_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_and_pd(__A, __B),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_and_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_and_pd(__A, __B),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_and_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_and_ps(__A, __B),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_and_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_and_ps(__A, __B),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_and_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_xor_pd(__A, __B),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_xor_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_xor_pd(__A, __B),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_xor_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_xor_ps(__A, __B),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_xor_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_xor_ps(__A, __B),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_xor_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_or_pd(__A, __B),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_or_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_or_pd(__A, __B),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_or_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_or_ps(__A, __B),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_or_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_or_ps(__A, __B),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_or_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtpd_epi64 (__m128d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtpd_epi64 (__m256d __A) {
+ return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
+ return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) {
+ return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtpd_epu64 (__m128d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtpd_epu64 (__m256d __A) {
+ return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
+ return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) {
+ return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtps_epi64 (__m128 __A) {
+ return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
+ return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
+ return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtps_epi64 (__m128 __A) {
+ return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
+ return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
+ return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtps_epu64 (__m128 __A) {
+ return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
+ return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
+ return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtps_epu64 (__m128 __A) {
+ return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
+ return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
+ return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_cvtepi64_pd (__m128i __A) {
+ return (__m128d)__builtin_convertvector((__v2di)__A, __v2df);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_cvtepi64_pd(__A),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_cvtepi64_pd(__A),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_cvtepi64_pd (__m256i __A) {
+ return (__m256d)__builtin_convertvector((__v4di)__A, __v4df);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_cvtepi64_pd(__A),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_cvtepi64_pd(__A),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_cvtepi64_ps (__m128i __A) {
+ return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+ (__v4sf) _mm_setzero_ps(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+ return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) {
+ return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+ (__v4sf) _mm_setzero_ps(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
+_mm256_cvtepi64_ps (__m256i __A) {
+ return (__m128)__builtin_convertvector((__v4di)__A, __v4sf);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm256_cvtepi64_ps(__A),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm256_cvtepi64_ps(__A),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvttpd_epi64 (__m128d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvttpd_epi64 (__m256d __A) {
+ return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
+ return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) {
+ return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvttpd_epu64 (__m128d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvttpd_epu64 (__m256d __A) {
+ return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
+ return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) {
+ return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvttps_epi64 (__m128 __A) {
+ return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
+ return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
+ return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvttps_epi64 (__m128 __A) {
+ return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
+ return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
+ return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvttps_epu64 (__m128 __A) {
+ return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
+ return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
+ return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+ (__v2di) _mm_setzero_si128(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvttps_epu64 (__m128 __A) {
+ return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
+ return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
+ return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+ (__v4di) _mm256_setzero_si256(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_cvtepu64_pd (__m128i __A) {
+ return (__m128d)__builtin_convertvector((__v2du)__A, __v2df);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_cvtepu64_pd(__A),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_cvtepu64_pd(__A),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_cvtepu64_pd (__m256i __A) {
+ return (__m256d)__builtin_convertvector((__v4du)__A, __v4df);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_cvtepu64_pd(__A),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_cvtepu64_pd(__A),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_cvtepu64_ps (__m128i __A) {
+ return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+ (__v4sf) _mm_setzero_ps(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+ return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) {
+ return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+ (__v4sf) _mm_setzero_ps(),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
+_mm256_cvtepu64_ps (__m256i __A) {
+ return (__m128)__builtin_convertvector((__v4du)__A, __v4sf);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm256_cvtepu64_ps(__A),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm256_cvtepu64_ps(__A),
+ (__v4sf)_mm_setzero_ps());
+}
+
+#define _mm_range_pd(A, B, C) \
+ ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1))
+
+#define _mm_mask_range_pd(W, U, A, B, C) \
+ ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_range_pd(U, A, B, C) \
+ ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm256_range_pd(A, B, C) \
+ ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(C), \
+ (__v4df)_mm256_setzero_pd(), \
+ (__mmask8)-1))
+
+#define _mm256_mask_range_pd(W, U, A, B, C) \
+ ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(C), \
+ (__v4df)(__m256d)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_range_pd(U, A, B, C) \
+ ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(C), \
+ (__v4df)_mm256_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm_range_ps(A, B, C) \
+ ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1))
+
+#define _mm_mask_range_ps(W, U, A, B, C) \
+ ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)(__m128)(W), (__mmask8)(U)))
+
+#define _mm_maskz_range_ps(U, A, B, C) \
+ ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U)))
+
+#define _mm256_range_ps(A, B, C) \
+ ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(C), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)-1))
+
+#define _mm256_mask_range_ps(W, U, A, B, C) \
+ ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(C), \
+ (__v8sf)(__m256)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_range_ps(U, A, B, C) \
+ ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(C), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)(U)))
+
+#define _mm_reduce_pd(A, B) \
+ ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1))
+
+#define _mm_mask_reduce_pd(W, U, A, B) \
+ ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_reduce_pd(U, A, B) \
+ ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm256_reduce_pd(A, B) \
+ ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+ (__v4df)_mm256_setzero_pd(), \
+ (__mmask8)-1))
+
+#define _mm256_mask_reduce_pd(W, U, A, B) \
+ ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+ (__v4df)(__m256d)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_reduce_pd(U, A, B) \
+ ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
+ (__v4df)_mm256_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm_reduce_ps(A, B) \
+ ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1))
+
+#define _mm_mask_reduce_ps(W, U, A, B) \
+ ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_reduce_ps(U, A, B) \
+ ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U)))
+
+#define _mm256_reduce_ps(A, B) \
+ ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)-1))
+
+#define _mm256_mask_reduce_ps(W, U, A, B) \
+ ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+ (__v8sf)(__m256)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_reduce_ps(U, A, B) \
+ ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)(U)))
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_movepi32_mask (__m128i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
+_mm256_movepi32_mask (__m256i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_movm_epi32 (__mmask8 __A)
+{
+ return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_movm_epi32 (__mmask8 __A)
+{
+ return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_movm_epi64 (__mmask8 __A)
+{
+ return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_movm_epi64 (__mmask8 __A)
+{
+ return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_movepi64_mask (__m128i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
+_mm256_movepi64_mask (__m256i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_broadcast_f32x2 (__m128 __A)
+{
+ return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
+ 0, 1, 0, 1, 0, 1, 0, 1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+ (__v8sf)_mm256_broadcast_f32x2(__A),
+ (__v8sf)__O);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+ (__v8sf)_mm256_broadcast_f32x2(__A),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_broadcast_f64x2(__m128d __A)
+{
+ return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
+ 0, 1, 0, 1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
+{
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
+ (__v4df)_mm256_broadcast_f64x2(__A),
+ (__v4df)__O);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
+ (__v4df)_mm256_broadcast_f64x2(__A),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_broadcast_i32x2 (__m128i __A)
+{
+ return (__m128i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+ 0, 1, 0, 1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_broadcast_i32x2(__A),
+ (__v4si)__O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_broadcast_i32x2(__A),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_broadcast_i32x2 (__m128i __A)
+{
+ return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+ 0, 1, 0, 1, 0, 1, 0, 1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_broadcast_i32x2(__A),
+ (__v8si)__O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_broadcast_i32x2(__A),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_broadcast_i64x2(__m128i __A)
+{
+ return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
+ 0, 1, 0, 1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_broadcast_i64x2(__A),
+ (__v4di)__O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_broadcast_i64x2(__A),
+ (__v4di)_mm256_setzero_si256());
+}
+
+#define _mm256_extractf64x2_pd(A, imm) \
+ ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
+ (int)(imm), \
+ (__v2df)_mm_undefined_pd(), \
+ (__mmask8)-1))
+
+#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
+ ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
+ (int)(imm), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_extractf64x2_pd(U, A, imm) \
+ ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
+ (int)(imm), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm256_extracti64x2_epi64(A, imm) \
+ ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
+ (int)(imm), \
+ (__v2di)_mm_undefined_si128(), \
+ (__mmask8)-1))
+
+#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
+ ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
+ (int)(imm), \
+ (__v2di)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \
+ ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
+ (int)(imm), \
+ (__v2di)_mm_setzero_si128(), \
+ (__mmask8)(U)))
+
+#define _mm256_insertf64x2(A, B, imm) \
+ ((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
+ (__v2df)(__m128d)(B), (int)(imm)))
+
+#define _mm256_mask_insertf64x2(W, U, A, B, imm) \
+ ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
+ (__v4df)(__m256d)(W)))
+
+#define _mm256_maskz_insertf64x2(U, A, B, imm) \
+ ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
+ (__v4df)_mm256_setzero_pd()))
+
+#define _mm256_inserti64x2(A, B, imm) \
+ ((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
+ (__v2di)(__m128i)(B), (int)(imm)))
+
+#define _mm256_mask_inserti64x2(W, U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
+ (__v4di)(__m256i)(W)))
+
+#define _mm256_maskz_inserti64x2(U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
+ (__v4di)_mm256_setzero_si256()))
+
+#define _mm_mask_fpclass_pd_mask(U, A, imm) \
+ ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm_fpclass_pd_mask(A, imm) \
+ ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm256_mask_fpclass_pd_mask(U, A, imm) \
+ ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm256_fpclass_pd_mask(A, imm) \
+ ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm_mask_fpclass_ps_mask(U, A, imm) \
+ ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm_fpclass_ps_mask(A, imm) \
+ ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm256_mask_fpclass_ps_mask(U, A, imm) \
+ ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm256_fpclass_ps_mask(A, imm) \
+ ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
+ (__mmask8)-1))
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif /* __AVX512VLDQINTRIN_H */
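
Likewise, a minimal sketch of the masked 64-bit lane-wise multiply added by this header (illustrative only, not part of the imported header; assumes -mavx512dq -mavx512vl, with _mm256_setr_epi64x and _mm256_storeu_si256 coming from the wider AVX header set):

/* Illustrative only: zero-masked 64-bit multiply using _mm256_maskz_mullo_epi64. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256i a = _mm256_setr_epi64x(1, 2, 3, 4);
    __m256i b = _mm256_setr_epi64x(10, 20, 30, 40);

    /* Keep the products in lanes 0 and 2 (mask 0b0101), zero the other lanes. */
    __m256i p = _mm256_maskz_mullo_epi64(0x5, a, b);

    long long out[4];
    _mm256_storeu_si256((__m256i *)out, p);
    printf("%lld %lld %lld %lld\n", out[0], out[1], out[2], out[3]); /* 10 0 90 0 */
    return 0;
}
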
--- /dev/null
+/*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLFP16INTRIN_H
+#define __AVX512VLFP16INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512fp16, avx512vl"), \
+ __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512fp16, avx512vl"), \
+ __min_vector_width__(128)))
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) {
+ return __a[0];
+}
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) {
+ return __a[0];
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) {
+ return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0};
+}
+
+static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) {
+ return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h};
+}
+
+static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) {
+ return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h,
+ __h, __h, __h, __h, __h, __h, __h, __h};
+}
+
+static __inline __m128h __DEFAULT_FN_ATTRS128
+_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
+ _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) {
+ return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1};
+}
+
+static __inline __m256h __DEFAULT_FN_ATTRS256
+_mm256_set1_pch(_Float16 _Complex h) {
+ return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h));
+}
+
+static __inline __m128h __DEFAULT_FN_ATTRS128
+_mm_set1_pch(_Float16 _Complex h) {
+ return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h));
+}
+
+static __inline __m256h __DEFAULT_FN_ATTRS256
+_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
+ _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
+ _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
+ _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) {
+ return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11,
+ __h10, __h9, __h8, __h7, __h6, __h5,
+ __h4, __h3, __h2, __h1};
+}
+
+#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8) \
+ _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))
+
+#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
+ h14, h15, h16) \
+ _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8), \
+ (h7), (h6), (h5), (h4), (h3), (h2), (h1))
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A,
+ __m256h __B) {
+ return (__m256h)((__v16hf)__A + (__v16hf)__B);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A,
+ __m128h __B) {
+ return (__m128h)((__v8hf)__A + (__v8hf)__B);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
+ (__v8hf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A,
+ __m256h __B) {
+ return (__m256h)((__v16hf)__A - (__v16hf)__B);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A,
+ __m128h __B) {
+ return (__m128h)((__v8hf)__A - (__v8hf)__B);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
+ (__v8hf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A,
+ __m256h __B) {
+ return (__m256h)((__v16hf)__A * (__v16hf)__B);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A,
+ __m128h __B) {
+ return (__m128h)((__v8hf)__A * (__v8hf)__B);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
+ (__v8hf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A,
+ __m256h __B) {
+ return (__m256h)((__v16hf)__A / (__v16hf)__B);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A,
+ __m128h __B) {
+ return (__m128h)((__v8hf)__A / (__v8hf)__B);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
+ (__v8hf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A,
+ __m256h __B) {
+ return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
+ (__v16hf)__W);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
+ (__v8hf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A,
+ __m256h __B) {
+ return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
+ (__v16hf)__W);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
+ (__v8hf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) {
+ return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) {
+ return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) {
+ return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f));
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_selectps_256(
+ (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_selectps_256(
+ (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) {
+ return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f));
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W,
+ __mmask8 __U,
+ __m128h __A) {
+ return (__m128h)__builtin_ia32_selectps_128(
+ (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) {
+ return (__m128h)__builtin_ia32_selectps_128(
+ (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps());
+}
+
+#define _mm256_cmp_ph_mask(a, b, p) \
+ ((__mmask16)__builtin_ia32_cmpph256_mask( \
+ (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1))
+
+#define _mm256_mask_cmp_ph_mask(m, a, b, p) \
+ ((__mmask16)__builtin_ia32_cmpph256_mask( \
+ (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m)))
+
+#define _mm_cmp_ph_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpph128_mask( \
+ (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1))
+
+#define _mm_mask_cmp_ph_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpph128_mask( \
+ (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m)))
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) {
+ return (__m256h)__builtin_ia32_rcpph256_mask(
+ (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W,
+ (__mmask16)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_rcpph256_mask(
+ (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) {
+ return (__m128h)__builtin_ia32_rcpph128_mask(
+ (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W,
+ __mmask8 __U,
+ __m128h __A) {
+ return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U,
+ __m128h __A) {
+ return (__m128h)__builtin_ia32_rcpph128_mask(
+ (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) {
+ return (__m256h)__builtin_ia32_rsqrtph256_mask(
+ (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W,
+ (__mmask16)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_rsqrtph256_mask(
+ (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) {
+ return (__m128h)__builtin_ia32_rsqrtph128_mask(
+ (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W,
+ __mmask8 __U,
+ __m128h __A) {
+ return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) {
+ return (__m128h)__builtin_ia32_rsqrtph128_mask(
+ (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) {
+ return (__m128h)__builtin_ia32_getexpph128_mask(
+ (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
+ return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) {
+ return (__m128h)__builtin_ia32_getexpph128_mask(
+ (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) {
+ return (__m256h)__builtin_ia32_getexpph256_mask(
+ (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W,
+ (__mmask16)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_getexpph256_mask(
+ (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
+}
+
+#define _mm_getmant_ph(A, B, C) \
+ ((__m128h)__builtin_ia32_getmantph128_mask( \
+ (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1))
+
+#define _mm_mask_getmant_ph(W, U, A, B, C) \
+ ((__m128h)__builtin_ia32_getmantph128_mask( \
+ (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_getmant_ph(U, A, B, C) \
+ ((__m128h)__builtin_ia32_getmantph128_mask( \
+ (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U)))
+
+#define _mm256_getmant_ph(A, B, C) \
+ ((__m256h)__builtin_ia32_getmantph256_mask( \
+ (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
+ (__v16hf)_mm256_setzero_ph(), (__mmask16)-1))
+
+#define _mm256_mask_getmant_ph(W, U, A, B, C) \
+ ((__m256h)__builtin_ia32_getmantph256_mask( \
+ (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \
+ (__mmask16)(U)))
+
+#define _mm256_maskz_getmant_ph(U, A, B, C) \
+ ((__m256h)__builtin_ia32_getmantph256_mask( \
+ (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
+ (__v16hf)_mm256_setzero_ph(), (__mmask16)(U)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_scalefph128_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_scalefph128_mask(
+ (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A,
+ __m256h __B) {
+ return (__m256h)__builtin_ia32_scalefph256_mask(
+ (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B,
+ (__v16hf)__W, (__mmask16)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_scalefph256_mask(
+ (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
+}
+
+#define _mm_roundscale_ph(A, imm) \
+ ((__m128h)__builtin_ia32_rndscaleph_128_mask( \
+ (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1))
+
+#define _mm_mask_roundscale_ph(W, U, A, imm) \
+ ((__m128h)__builtin_ia32_rndscaleph_128_mask( \
+ (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))
+
+#define _mm_maskz_roundscale_ph(U, A, imm) \
+ ((__m128h)__builtin_ia32_rndscaleph_128_mask( \
+ (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U)))
+
+#define _mm256_roundscale_ph(A, imm) \
+ ((__m256h)__builtin_ia32_rndscaleph_256_mask( \
+ (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
+ (__mmask16)-1))
+
+#define _mm256_mask_roundscale_ph(W, U, A, imm) \
+ ((__m256h)__builtin_ia32_rndscaleph_256_mask( \
+ (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \
+ (__mmask16)(U)))
+
+#define _mm256_maskz_roundscale_ph(U, A, imm) \
+ ((__m256h)__builtin_ia32_rndscaleph_256_mask( \
+ (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
+ (__mmask16)(U)))
+
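+/* In the roundscale macros above, imm follows the VRNDSCALEPH imm8
+ * encoding: the low nibble selects the rounding behaviour and the high
+ * nibble the number of fraction bits to round to. */
+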
+#define _mm_reduce_ph(A, imm) \
+ ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \
+ (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1))
+
+#define _mm_mask_reduce_ph(W, U, A, imm) \
+ ((__m128h)__builtin_ia32_reduceph128_mask( \
+ (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))
+
+#define _mm_maskz_reduce_ph(U, A, imm) \
+ ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \
+ (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U)))
+
+#define _mm256_reduce_ph(A, imm) \
+ ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
+ (__v16hf)_mm256_setzero_ph(), \
+ (__mmask16)-1))
+
+#define _mm256_mask_reduce_ph(W, U, A, imm) \
+ ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
+ (__v16hf)(__m256h)(W), \
+ (__mmask16)(U)))
+
+#define _mm256_maskz_reduce_ph(U, A, imm) \
+ ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
+ (__v16hf)_mm256_setzero_ph(), \
+ (__mmask16)(U)))
+
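+/* The _mm*_reduce_ph macros above perform the VREDUCEPH argument-reduction
+ * operation controlled by imm; they are unrelated to the horizontal
+ * _mm*_reduce_add_ph / _mm*_reduce_mul_ph helpers defined later in this
+ * header. */
+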
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
+ return __builtin_ia32_sqrtph((__v8hf)__a);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
+ __mmask8 __U,
+ __m128h __A) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
+ __m128h __A) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
+ return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) {
+ return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
+ (__v16hf)_mm256_sqrt_ph(__A),
+ (__v16hf)_mm256_setzero_ph());
+}
+
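+/* Usage sketch (illustrative values): given
+ *   __m128h r = _mm_mask_sqrt_ph(src, 0x0F, x);
+ * lanes 0-3 of r receive the square roots of the corresponding lanes of x
+ * while lanes 4-7 are copied from src; the maskz variant zeroes the
+ * unselected lanes instead. */
+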
+#define _mm_mask_fpclass_ph_mask(U, A, imm) \
+ ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \
+ (int)(imm), (__mmask8)(U)))
+
+#define _mm_fpclass_ph_mask(A, imm) \
+ ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \
+ (int)(imm), (__mmask8)-1))
+
+#define _mm256_mask_fpclass_ph_mask(U, A, imm) \
+ ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \
+ (int)(imm), (__mmask16)(U)))
+
+#define _mm256_fpclass_ph_mask(A, imm) \
+ ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \
+ (int)(imm), (__mmask16)-1))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) {
+ return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
+ (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W,
+ __mmask8 __U,
+ __m128d __A) {
+ return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) {
+ return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
+ (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) {
+ return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
+ (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) {
+ return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) {
+ return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
+ (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) {
+ return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
+ (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W,
+ __mmask8 __U,
+ __m128h __A) {
+ return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
+ return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
+ (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) {
+ return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
+ (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) {
+ return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
+ return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
+ (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2w128_mask(
+ (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2w128_mask(
+ (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtph_epi16(__m256h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2w256_mask(
+ (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W,
+ (__mmask16)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2w256_mask(
+ (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2w128_mask(
+ (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2w128_mask(
+ (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvttph_epi16(__m256h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2w256_mask(
+ (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W,
+ (__mmask16)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2w256_mask(
+ (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
+}
+
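+/* The cvtt* conversions above truncate toward zero, whereas the plain cvt*
+ * conversions round according to the current MXCSR rounding mode. */
+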
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) {
+ return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_cvtepi16_ph(__m256i __A) {
+ return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) {
+ return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
+ (__v16hf)_mm256_cvtepi16_ph(__A),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
+ (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
+ (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtph_epu16(__m256h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
+ (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W,
+ (__mmask16)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
+ (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
+ (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
+ (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvttph_epu16(__m256h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
+ (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W,
+ (__mmask16)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
+ (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) {
+ return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_cvtepu16_ph(__m256i __A) {
+ return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) {
+ return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
+ (__v16hf)_mm256_cvtepu16_ph(__A),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
+ (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
+ (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtph_epi32(__m128h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
+ (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
+ (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
+ (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
+ (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtph_epu32(__m128h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
+ (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
+ (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) {
+ return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
+ (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
+ return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) {
+ return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
+ (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_cvtepi32_ph(__m256i __A) {
+ return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) {
+ return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
+ (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
+ return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) {
+ return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
+ (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_cvtepu32_ph(__m256i __A) {
+ return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
+ (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
+ (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvttph_epi32(__m128h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
+ (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
+ (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
+ (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
+ (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvttph_epu32(__m128h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
+ (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
+ (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) {
+ return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
+ (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
+ return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) {
+ return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
+ (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_cvtepi64_ph(__m256i __A) {
+ return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
+ (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
+ return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) {
+ return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
+ (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
+ (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
+ (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtph_epi64(__m128h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
+ (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
+ (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) {
+ return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
+ (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
+ return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) {
+ return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
+ (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_cvtepu64_ph(__m256i __A) {
+ return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
+ (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
+ return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) {
+ return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
+ (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
+ (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
+ (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtph_epu64(__m128h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
+ (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
+ (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
+ (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
+ (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvttph_epi64(__m128h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
+ (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
+ (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
+ (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
+ return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
+ (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvttph_epu64(__m128h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
+ (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
+ return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
+ (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) {
+ return (__m128)__builtin_ia32_vcvtph2psx128_mask(
+ (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W,
+ __mmask8 __U,
+ __m128h __A) {
+ return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
+ return (__m128)__builtin_ia32_vcvtph2psx128_mask(
+ (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) {
+ return (__m256)__builtin_ia32_vcvtph2psx256_mask(
+ (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) {
+ return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
+ return (__m256)__builtin_ia32_vcvtph2psx256_mask(
+ (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) {
+ return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
+ (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W,
+ __mmask8 __U,
+ __m128 __A) {
+ return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) {
+ return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
+ (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) {
+ return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
+ (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) {
+ return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W,
+ (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
+ return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
+ (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
+ __mmask8 __U,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)_mm_setzero_ph());
+}
+
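+/* Masking convention for the FMA forms in this header: the "mask" form
+ * keeps unselected lanes from the first operand __A, the "mask3" form
+ * keeps them from the addend __C, and the "maskz" form zeroes them, as the
+ * pass-through argument of each selectph_* call above shows. */
+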
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
+ -(__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A,
+ __mmask8 __U,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A,
+ __m256h __B,
+ __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
+ (__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A,
+ __m256h __B,
+ __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
+ -(__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
+ (__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
+ -(__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)_mm_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
+ (__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
+ -(__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)_mm256_setzero_ph());
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
+ (__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
+ (__v8hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A,
+ __m256h __B,
+ __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
+ (__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
+ (__v16hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
+ -(__v8hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_selectph_128(
+ (__mmask8)__U,
+ __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
+ (__v8hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A,
+ __m256h __B,
+ __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
+ -(__v16hf)__C);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
+ return (__m256h)__builtin_ia32_selectph_256(
+ (__mmask16)__U,
+ __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
+ (__v16hf)__C);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfcmulcph128_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_vfcmulcph128_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmul_pch(__m256h __A,
+ __m256h __B) {
+ return (__m256h)__builtin_ia32_vfcmulcph256_mask(
+ (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_vfcmulcph256_mask(
+ (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C, (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectps_128(
+ __U,
+ __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B,
+ (__v4sf)__C, (__mmask8)__U),
+ (__v4sf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_vfcmaddcph128_maskz(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A,
+ __m256h __B,
+ __m256h __C) {
+ return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__C, (__mmask8)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectps_256(
+ __U,
+ __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
+ (__mmask8)__U),
+ (__v8sf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
+ return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__C, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_vfcmaddcph256_maskz(
+ (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmulcph128_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W,
+ __mmask8 __U,
+ __m128h __A,
+ __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_vfmulcph128_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A,
+ __m256h __B) {
+ return (__m256h)__builtin_ia32_vfmulcph256_mask(
+ (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_vfmulcph256_mask(
+ (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A,
+ __m128h __B,
+ __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C, (__mmask8)-1);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_selectps_128(
+ __U,
+ __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
+ (__mmask8)__U),
+ (__v4sf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
+ return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C, (__mmask8)__U);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
+ return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A,
+ __m256h __B,
+ __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__C, (__mmask8)-1);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_selectps_256(
+ __U,
+ __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
+ (__mmask8)__U),
+ (__v8sf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
+ return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__C, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
+ return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__C, (__mmask8)__U);
+}
+
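+/* The *_pch intrinsics above operate on packed complex numbers built from
+ * (real, imaginary) _Float16 pairs, so a __m128h carries 4 complex values
+ * and a __m256h carries 8; each mask bit selects one complex pair, which
+ * is why the 32-bit selectps_* builtins are used for masking. */
+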
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U,
+ __m128h __A,
+ __m128h __W) {
+ return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W,
+ (__v8hf)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
+ return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W,
+ (__v16hf)__A);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
+ return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
+ (__v8hi)__B);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
+ return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
+ (__v16hi)__B);
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_permutexvar_ph(__m128i __A, __m128h __B) {
+ return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256
+_mm256_permutexvar_ph(__m256i __A, __m256h __B) {
+ return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
+}
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS256
+_mm256_reduce_add_ph(__m256h __W) {
+ return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W);
+}
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS256
+_mm256_reduce_mul_ph(__m256h __W) {
+ return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W);
+}
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS256
+_mm256_reduce_max_ph(__m256h __V) {
+ return __builtin_ia32_reduce_fmax_ph256(__V);
+}
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS256
+_mm256_reduce_min_ph(__m256h __V) {
+ return __builtin_ia32_reduce_fmin_ph256(__V);
+}
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS128
+_mm_reduce_add_ph(__m128h __W) {
+ return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W);
+}
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS128
+_mm_reduce_mul_ph(__m128h __W) {
+ return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W);
+}
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS128
+_mm_reduce_max_ph(__m128h __V) {
+ return __builtin_ia32_reduce_fmax_ph128(__V);
+}
+
+static __inline__ _Float16 __DEFAULT_FN_ATTRS128
+_mm_reduce_min_ph(__m128h __V) {
+ return __builtin_ia32_reduce_fmin_ph128(__V);
+}
+
+// The intrinsics below are aliases for the f*mul_*ch intrinsics above.
+#define _mm_mul_pch(A, B) _mm_fmul_pch(A, B)
+#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B)
+#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B)
+#define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B)
+#define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B)
+#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B)
+
+#define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B)
+#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B)
+#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B)
+#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B)
+#define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B)
+#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B)
+
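+/* Usage sketch (illustrative): _mm_mul_pch(a, b) multiplies the four
+ * complex FP16 values of a and b element-wise, and _mm_cmul_pch(a, b) is
+ * the conjugate form, multiplying a by the complex conjugate of each
+ * element of b (VFMULCPH / VFCMULCPH). */
+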
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
--- /dev/null
+/*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLINTRIN_H
+#define __AVX512VLINTRIN_H
+
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(256)))
+
+typedef short __v2hi __attribute__((__vector_size__(4)));
+typedef char __v4qi __attribute__((__vector_size__(4)));
+typedef char __v2qi __attribute__((__vector_size__(2)));
+
+/* Integer compare */
+
+#define _mm_cmpeq_epi32_mask(A, B) \
+ _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epi32_mask(k, A, B) \
+ _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epi32_mask(A, B) \
+ _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epi32_mask(k, A, B) \
+ _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epi32_mask(A, B) \
+ _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epi32_mask(k, A, B) \
+ _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epi32_mask(A, B) \
+ _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epi32_mask(k, A, B) \
+ _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epi32_mask(A, B) \
+ _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epi32_mask(k, A, B) \
+ _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epi32_mask(A, B) \
+ _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epi32_mask(k, A, B) \
+ _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm256_cmpeq_epi32_mask(A, B) \
+ _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epi32_mask(k, A, B) \
+ _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epi32_mask(A, B) \
+ _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epi32_mask(k, A, B) \
+ _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epi32_mask(A, B) \
+ _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epi32_mask(k, A, B) \
+ _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epi32_mask(A, B) \
+ _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epi32_mask(k, A, B) \
+ _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epi32_mask(A, B) \
+ _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epi32_mask(k, A, B) \
+ _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epi32_mask(A, B) \
+ _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epi32_mask(k, A, B) \
+ _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm_cmpeq_epu32_mask(A, B) \
+ _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epu32_mask(k, A, B) \
+ _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epu32_mask(A, B) \
+ _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epu32_mask(k, A, B) \
+ _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epu32_mask(A, B) \
+ _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epu32_mask(k, A, B) \
+ _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epu32_mask(A, B) \
+ _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epu32_mask(k, A, B) \
+ _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epu32_mask(A, B) \
+ _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epu32_mask(k, A, B) \
+ _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epu32_mask(A, B) \
+ _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epu32_mask(k, A, B) \
+ _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm256_cmpeq_epu32_mask(A, B) \
+ _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epu32_mask(k, A, B) \
+ _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epu32_mask(A, B) \
+ _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epu32_mask(k, A, B) \
+ _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epu32_mask(A, B) \
+ _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epu32_mask(k, A, B) \
+ _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epu32_mask(A, B) \
+ _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epu32_mask(k, A, B) \
+ _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epu32_mask(A, B) \
+ _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epu32_mask(k, A, B) \
+ _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epu32_mask(A, B) \
+ _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epu32_mask(k, A, B) \
+ _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm_cmpeq_epi64_mask(A, B) \
+ _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epi64_mask(k, A, B) \
+ _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epi64_mask(A, B) \
+ _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epi64_mask(k, A, B) \
+ _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epi64_mask(A, B) \
+ _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epi64_mask(k, A, B) \
+ _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epi64_mask(A, B) \
+ _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epi64_mask(k, A, B) \
+ _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epi64_mask(A, B) \
+ _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epi64_mask(k, A, B) \
+ _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epi64_mask(A, B) \
+ _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epi64_mask(k, A, B) \
+ _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm256_cmpeq_epi64_mask(A, B) \
+ _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epi64_mask(k, A, B) \
+ _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epi64_mask(A, B) \
+ _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epi64_mask(k, A, B) \
+ _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epi64_mask(A, B) \
+ _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epi64_mask(k, A, B) \
+ _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epi64_mask(A, B) \
+ _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epi64_mask(k, A, B) \
+ _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epi64_mask(A, B) \
+ _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epi64_mask(k, A, B) \
+ _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epi64_mask(A, B) \
+ _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epi64_mask(k, A, B) \
+ _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm_cmpeq_epu64_mask(A, B) \
+ _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm_mask_cmpeq_epu64_mask(k, A, B) \
+ _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm_cmpge_epu64_mask(A, B) \
+ _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
+#define _mm_mask_cmpge_epu64_mask(k, A, B) \
+ _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm_cmpgt_epu64_mask(A, B) \
+ _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
+#define _mm_mask_cmpgt_epu64_mask(k, A, B) \
+ _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm_cmple_epu64_mask(A, B) \
+ _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
+#define _mm_mask_cmple_epu64_mask(k, A, B) \
+ _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm_cmplt_epu64_mask(A, B) \
+ _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
+#define _mm_mask_cmplt_epu64_mask(k, A, B) \
+ _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm_cmpneq_epu64_mask(A, B) \
+ _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
+#define _mm_mask_cmpneq_epu64_mask(k, A, B) \
+ _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
+
+#define _mm256_cmpeq_epu64_mask(A, B) \
+ _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
+#define _mm256_mask_cmpeq_epu64_mask(k, A, B) \
+ _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
+#define _mm256_cmpge_epu64_mask(A, B) \
+ _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
+#define _mm256_mask_cmpge_epu64_mask(k, A, B) \
+ _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
+#define _mm256_cmpgt_epu64_mask(A, B) \
+ _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
+#define _mm256_mask_cmpgt_epu64_mask(k, A, B) \
+ _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
+#define _mm256_cmple_epu64_mask(A, B) \
+ _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
+#define _mm256_mask_cmple_epu64_mask(k, A, B) \
+ _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
+#define _mm256_cmplt_epu64_mask(A, B) \
+ _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
+#define _mm256_mask_cmplt_epu64_mask(k, A, B) \
+ _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
+#define _mm256_cmpneq_epu64_mask(A, B) \
+ _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
+#define _mm256_mask_cmpneq_epu64_mask(k, A, B) \
+ _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
+
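+/*
+ * Illustrative usage sketch (editorial note, not part of the upstream header;
+ * assumes <immintrin.h> and an AVX512VL-enabled target): each cmp*_mask form
+ * returns an 8-bit mask with one bit per element.
+ *
+ *   __m128i a = _mm_set_epi32(3, 2, 1, 0);    // lanes: {0, 1, 2, 3}
+ *   __m128i b = _mm_set1_epi32(2);
+ *   __mmask8 k = _mm_cmplt_epi32_mask(a, b);  // k == 0x3 (lanes 0 and 1 are < 2)
+ */
+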
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_add_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_add_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_add_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_add_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sub_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sub_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sub_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sub_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_add_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_add_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_add_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_add_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sub_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sub_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sub_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sub_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
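+/*
+ * Illustrative usage sketch (editorial note, not part of the upstream header):
+ * the mask_ forms merge with a source vector, the maskz_ forms zero the
+ * inactive lanes.
+ *
+ *   __m128i a = _mm_set1_epi32(1), b = _mm_set1_epi32(2);
+ *   __m128i src = _mm_set1_epi32(-1);
+ *   __m128i r = _mm_mask_add_epi32(src, 0x5, a, b);  // r = {3, -1, 3, -1}
+ *   __m128i z = _mm_maskz_add_epi32(0x5, a, b);      // z = {3, 0, 3, 0}
+ */
+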
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_mul_epi32(__X, __Y),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_mul_epi32(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_mul_epi32(__X, __Y),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_mul_epi32(__X, __Y),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_mul_epu32(__X, __Y),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_mul_epu32(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_mul_epu32(__X, __Y),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_mul_epu32(__X, __Y),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_mullo_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_mullo_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_mullo_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_mullo_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_and_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v8su)__a & (__v8su)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_and_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)_mm256_mask_and_epi32(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_and_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v4su)__a & (__v4su)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_and_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)_mm_mask_and_epi32(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_andnot_epi32(__m256i __A, __m256i __B)
+{
+ return (__m256i)(~(__v8su)__A & (__v8su)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_andnot_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)_mm256_mask_andnot_epi32(_mm256_setzero_si256(),
+ __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_andnot_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i)(~(__v4su)__A & (__v4su)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_andnot_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_andnot_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)_mm_mask_andnot_epi32(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_or_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v8su)__a | (__v8su)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_or_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)_mm256_mask_or_epi32(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_or_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v4su)__a | (__v4su)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_or_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)_mm_mask_or_epi32(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_xor_epi32(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v8su)__a ^ (__v8su)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_xor_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)_mm256_mask_xor_epi32(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_xor_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v4su)__a ^ (__v4su)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_xor_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)_mm_mask_xor_epi32(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_and_epi64(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v4du)__a & (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_and_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)_mm256_mask_and_epi64(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_and_epi64(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v2du)__a & (__v2du)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_and_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)_mm_mask_and_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_andnot_epi64(__m256i __A, __m256i __B)
+{
+ return (__m256i)(~(__v4du)__A & (__v4du)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_andnot_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)_mm256_mask_andnot_epi64(_mm256_setzero_si256(),
+ __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_andnot_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i)(~(__v2du)__A & (__v2du)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_andnot_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)_mm_mask_andnot_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_or_epi64(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v4du)__a | (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_or_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)_mm256_mask_or_epi64(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_or_epi64(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v2du)__a | (__v2du)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_or_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)_mm_mask_or_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_xor_epi64(__m256i __a, __m256i __b)
+{
+ return (__m256i)((__v4du)__a ^ (__v4du)__b);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_xor_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)_mm256_mask_xor_epi64(_mm256_setzero_si256(), __U, __A, __B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_xor_epi64(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v2du)__a ^ (__v2du)__b);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_xor_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)_mm_mask_xor_epi64(_mm_setzero_si128(), __U, __A, __B);
+}
+
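+/*
+ * Illustrative usage sketch (editorial note, not part of the upstream header):
+ * the unmasked forms are plain element-wise operations; the masked forms
+ * route through the same select pattern used above.
+ *
+ *   __m128i a = _mm_set1_epi32(0xF0), b = _mm_set1_epi32(0x3C);
+ *   __m128i r = _mm_maskz_and_epi32(0x3, a, b);   // r = {0x30, 0x30, 0, 0}
+ */
+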
+#define _mm_cmp_epi32_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+ (__v4si)(__m128i)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm_mask_cmp_epi32_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
+ (__v4si)(__m128i)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm_cmp_epu32_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+ (__v4si)(__m128i)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm_mask_cmp_epu32_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
+ (__v4si)(__m128i)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm256_cmp_epi32_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+ (__v8si)(__m256i)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm256_mask_cmp_epi32_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
+ (__v8si)(__m256i)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm256_cmp_epu32_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+ (__v8si)(__m256i)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm256_mask_cmp_epu32_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
+ (__v8si)(__m256i)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm_cmp_epi64_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+ (__v2di)(__m128i)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm_mask_cmp_epi64_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
+ (__v2di)(__m128i)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm_cmp_epu64_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+ (__v2di)(__m128i)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm_mask_cmp_epu64_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
+ (__v2di)(__m128i)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm256_cmp_epi64_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+ (__v4di)(__m256i)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm256_mask_cmp_epi64_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
+ (__v4di)(__m256i)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm256_cmp_epu64_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+ (__v4di)(__m256i)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm256_mask_cmp_epu64_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
+ (__v4di)(__m256i)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm256_cmp_ps_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
+ (__v8sf)(__m256)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm256_mask_cmp_ps_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
+ (__v8sf)(__m256)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm256_cmp_pd_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+ (__v4df)(__m256d)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm256_mask_cmp_pd_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
+ (__v4df)(__m256d)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm_cmp_ps_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
+ (__v4sf)(__m128)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm_mask_cmp_ps_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
+ (__v4sf)(__m128)(b), (int)(p), \
+ (__mmask8)(m)))
+
+#define _mm_cmp_pd_mask(a, b, p) \
+ ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+ (__v2df)(__m128d)(b), (int)(p), \
+ (__mmask8)-1))
+
+#define _mm_mask_cmp_pd_mask(m, a, b, p) \
+ ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
+ (__v2df)(__m128d)(b), (int)(p), \
+ (__mmask8)(m)))
+
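+/*
+ * Illustrative usage sketch (editorial note, not part of the upstream header):
+ * the floating-point cmp macros take a predicate constant from <avxintrin.h>
+ * (e.g. _CMP_LT_OQ) and produce a per-lane bitmask.
+ *
+ *   __mmask8 k = _mm256_cmp_ps_mask(a, b, _CMP_LT_OQ);  // bit i set where a[i] < b[i]
+ *   __m256   r = _mm256_mask_blend_ps(k, b, a);         // per-lane minimum of a and b
+ */
+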
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddpd ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C),
+ (__v2df) __A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddpd ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C),
+ (__v2df) __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddpd ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddpd ((__v2df) __A,
+ (__v2df) __B,
+ -(__v2df) __C),
+ (__v2df) __A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddpd ((__v2df) __A,
+ (__v2df) __B,
+ -(__v2df) __C),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddpd (-(__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C),
+ (__v2df) __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddpd (-(__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddpd (-(__v2df) __A,
+ (__v2df) __B,
+ -(__v2df) __C),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C),
+ (__v4df) __A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C),
+ (__v4df) __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+ (__v4df) __B,
+ -(__v4df) __C),
+ (__v4df) __A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+ (__v4df) __B,
+ -(__v4df) __C),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C),
+ (__v4df) __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
+ (__v4df) __B,
+ -(__v4df) __C),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddps ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C),
+ (__v4sf) __A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddps ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C),
+ (__v4sf) __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddps ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddps ((__v4sf) __A,
+ (__v4sf) __B,
+ -(__v4sf) __C),
+ (__v4sf) __A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddps ((__v4sf) __A,
+ (__v4sf) __B,
+ -(__v4sf) __C),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddps (-(__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C),
+ (__v4sf) __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddps (-(__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddps (-(__v4sf) __A,
+ (__v4sf) __B,
+ -(__v4sf) __C),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C),
+ (__v8sf) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C),
+ (__v8sf) __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+ (__v8sf) __B,
+ -(__v8sf) __C),
+ (__v8sf) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+ (__v8sf) __B,
+ -(__v8sf) __C),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C),
+ (__v8sf) __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
+ (__v8sf) __B,
+ -(__v8sf) __C),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C),
+ (__v2df) __A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C),
+ (__v2df) __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+ (__v2df) __B,
+ -(__v2df) __C),
+ (__v2df) __A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+ (__v2df) __B,
+ -(__v2df) __C),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C),
+ (__v4df) __A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C),
+ (__v4df) __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+ (__v4df) __B,
+ -(__v4df) __C),
+ (__v4df) __A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+ (__v4df) __B,
+ -(__v4df) __C),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C),
+ (__v4sf) __A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C),
+ (__v4sf) __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+ (__v4sf) __B,
+ -(__v4sf) __C),
+ (__v4sf) __A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+ (__v4sf) __B,
+ -(__v4sf) __C),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B,
+ __m256 __C)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C),
+ (__v8sf) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C),
+ (__v8sf) __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+ (__v8sf) __B,
+ -(__v8sf) __C),
+ (__v8sf) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+ (__v8sf) __B,
+ -(__v8sf) __C),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddpd ((__v2df) __A,
+ (__v2df) __B,
+ -(__v2df) __C),
+ (__v2df) __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+ (__v4df) __B,
+ -(__v4df) __C),
+ (__v4df) __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddps ((__v4sf) __A,
+ (__v4sf) __B,
+ -(__v4sf) __C),
+ (__v4sf) __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+ (__v8sf) __B,
+ -(__v8sf) __C),
+ (__v8sf) __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddsubpd ((__v2df) __A,
+ (__v2df) __B,
+ -(__v2df) __C),
+ (__v2df) __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
+ (__v4df) __B,
+ -(__v4df) __C),
+ (__v4df) __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddsubps ((__v4sf) __A,
+ (__v4sf) __B,
+ -(__v4sf) __C),
+ (__v4sf) __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
+ (__v8sf) __B,
+ -(__v8sf) __C),
+ (__v8sf) __C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddpd ((__v2df) __A,
+ -(__v2df) __B,
+ (__v2df) __C),
+ (__v2df) __A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+ -(__v4df) __B,
+ (__v4df) __C),
+ (__v4df) __A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddps ((__v4sf) __A,
+ -(__v4sf) __B,
+ (__v4sf) __C),
+ (__v4sf) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+ -(__v8sf) __B,
+ (__v8sf) __C),
+ (__v8sf) __A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddpd ((__v2df) __A,
+ -(__v2df) __B,
+ -(__v2df) __C),
+ (__v2df) __A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
+ __builtin_ia32_vfmaddpd ((__v2df) __A,
+ -(__v2df) __B,
+ -(__v2df) __C),
+ (__v2df) __C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+ -(__v4df) __B,
+ -(__v4df) __C),
+ (__v4df) __A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
+{
+ return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
+ __builtin_ia32_vfmaddpd256 ((__v4df) __A,
+ -(__v4df) __B,
+ -(__v4df) __C),
+ (__v4df) __C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddps ((__v4sf) __A,
+ -(__v4sf) __B,
+ -(__v4sf) __C),
+ (__v4sf) __A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
+ __builtin_ia32_vfmaddps ((__v4sf) __A,
+ -(__v4sf) __B,
+ -(__v4sf) __C),
+ (__v4sf) __C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+ -(__v8sf) __B,
+ -(__v8sf) __C),
+ (__v8sf) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
+{
+ return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
+ __builtin_ia32_vfmaddps256 ((__v8sf) __A,
+ -(__v8sf) __B,
+ -(__v8sf) __C),
+ (__v8sf) __C);
+}
+
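+/*
+ * Illustrative usage sketch (editorial note, not part of the upstream header):
+ * the three masked FMA flavours differ only in which value an inactive lane
+ * keeps: mask_ passes through the first operand, mask3_ the addend, and
+ * maskz_ zeroes it.
+ *
+ *   __m128d r1 = _mm_mask_fmadd_pd(a, k, b, c);   // lane i: k ? a*b+c : a
+ *   __m128d r2 = _mm_mask3_fmadd_pd(a, b, c, k);  // lane i: k ? a*b+c : c
+ *   __m128d r3 = _mm_maskz_fmadd_pd(k, a, b, c);  // lane i: k ? a*b+c : 0.0
+ */
+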
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_add_pd(__A, __B),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_add_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_add_pd(__A, __B),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_add_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_add_ps(__A, __B),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_add_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_add_ps(__A, __B),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_add_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) {
+ return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
+ (__v4si) __W,
+ (__v4si) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) {
+ return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
+ (__v8si) __W,
+ (__v8si) __A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) {
+ return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
+ (__v2df) __W,
+ (__v2df) __A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) {
+ return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
+ (__v4df) __W,
+ (__v4df) __A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) {
+ return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
+ (__v4sf) __W,
+ (__v4sf) __A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) {
+ return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
+ (__v8sf) __W,
+ (__v8sf) __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) {
+ return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
+ (__v2di) __W,
+ (__v2di) __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) {
+ return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
+ (__v4di) __W,
+ (__v4di) __A);
+}
+
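+/*
+ * Illustrative usage sketch (editorial note, not part of the upstream header):
+ * mask_blend picks the lane from __W where the mask bit is set and from __A
+ * otherwise.
+ *
+ *   __m128i a = _mm_set1_epi32(-1), w = _mm_set1_epi32(7);
+ *   __m128i r = _mm_mask_blend_epi32(0x2, a, w);   // r = {-1, 7, -1, -1}
+ */
+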
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+ return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) {
+ return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+ return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) {
+ return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+ return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) {
+ return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+ return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) {
+ return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+ return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) {
+ return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+ return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) {
+ return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+ return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) {
+ return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+ return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) {
+ return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
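+/* Compress-store: contiguously write the mask-selected elements of __A to
+ * unaligned memory at __P; memory beyond the stored elements is untouched. */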
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) {
+ __builtin_ia32_compressstoredf128_mask ((__v2df *) __P,
+ (__v2df) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) {
+ __builtin_ia32_compressstoredf256_mask ((__v4df *) __P,
+ (__v4df) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) {
+ __builtin_ia32_compressstoredi128_mask ((__v2di *) __P,
+ (__v2di) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) {
+ __builtin_ia32_compressstoredi256_mask ((__v4di *) __P,
+ (__v4di) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) {
+ __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P,
+ (__v4sf) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) {
+ __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P,
+ (__v8sf) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) {
+ __builtin_ia32_compressstoresi128_mask ((__v4si *) __P,
+ (__v4si) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) {
+ __builtin_ia32_compressstoresi256_mask ((__v8si *) __P,
+ (__v8si) __A,
+ (__mmask8) __U);
+}
+
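+/* Masked conversions: where an SSE/AVX conversion intrinsic already exists,
+ * the masked forms simply wrap it in a select on the write mask. */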
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+ (__v2df)_mm_cvtepi32_pd(__A),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+ (__v2df)_mm_cvtepi32_pd(__A),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+ (__v4df)_mm256_cvtepi32_pd(__A),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+ (__v4df)_mm256_cvtepi32_pd(__A),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_cvtepi32_ps(__A),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_cvtepi32_ps(__A),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_cvtepi32_ps(__A),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_cvtepi32_ps(__A),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm256_cvtpd_epi32(__A),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm256_cvtpd_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) {
+ return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) {
+ return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm256_cvtpd_ps(__A),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm256_cvtpd_ps(__A),
+ (__v4sf)_mm_setzero_ps());
+}
+
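+/* Conversions to unsigned 32-bit integers (VCVTPD2UDQ/VCVTPS2UDQ) have no
+ * SSE/AVX equivalent, so even the unmasked forms go through the AVX-512
+ * masked builtins with an all-ones mask. */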
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtpd_epu32 (__m128d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtpd_epu32 (__m256d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) {
+ return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtps_epi32(__A),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtps_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtps_epi32(__A),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtps_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_cvtps_pd(__A),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_cvtps_pd(__A),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_cvtps_pd(__A),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_cvtps_pd(__A),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtps_epu32 (__m128 __A) {
+ return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
+ return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) {
+ return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvtps_epu32 (__m256 __A) {
+ return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
+ return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) {
+ return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm256_cvttpd_epi32(__A),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm256_cvttpd_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvttpd_epu32 (__m128d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvttpd_epu32 (__m256d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) {
+ return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvttps_epi32(__A),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvttps_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvttps_epi32(__A),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvttps_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvttps_epu32 (__m128 __A) {
+ return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
+ return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) {
+ return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cvttps_epu32 (__m256 __A) {
+ return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
+ return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) {
+ return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
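+/* Unsigned-integer to floating-point conversions are expressed with
+ * __builtin_convertvector on an unsigned element type (__v4su/__v8su); the
+ * 128-bit double form first extracts the low two elements. */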
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_cvtepu32_pd (__m128i __A) {
+ return (__m128d) __builtin_convertvector(
+ __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+ (__v2df)_mm_cvtepu32_pd(__A),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
+ (__v2df)_mm_cvtepu32_pd(__A),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_cvtepu32_pd (__m128i __A) {
+ return (__m256d)__builtin_convertvector((__v4su)__A, __v4df);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+ (__v4df)_mm256_cvtepu32_pd(__A),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
+ (__v4df)_mm256_cvtepu32_pd(__A),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_cvtepu32_ps (__m128i __A) {
+ return (__m128)__builtin_convertvector((__v4su)__A, __v4sf);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_cvtepu32_ps(__A),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_cvtepu32_ps(__A),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_cvtepu32_ps (__m256i __A) {
+ return (__m256)__builtin_convertvector((__v8su)__A, __v8sf);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_cvtepu32_ps(__A),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_cvtepu32_ps(__A),
+ (__v8sf)_mm256_setzero_ps());
+}
+
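+/* Masked floating-point arithmetic (div/max/min/mul below) reuses the plain
+ * SSE/AVX intrinsic and applies the write mask with a select builtin. */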
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_div_pd(__A, __B),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_div_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_div_pd(__A, __B),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_div_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_div_ps(__A, __B),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_div_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_div_ps(__A, __B),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_div_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
+}
+
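+/* Expand (VEXPANDPD and friends) is the inverse of compress: consecutive low
+ * elements of __A are placed into the positions selected by the mask;
+ * unselected positions come from __W (mask) or are zeroed (maskz). */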
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+ return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) {
+ return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+ return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) {
+ return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+ return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) {
+ return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+ return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) {
+ return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
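+/* Expand-load: same as expand, but the consecutive source elements are read
+ * from unaligned memory at __P. */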
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) {
+ return (__m128d) __builtin_ia32_expandloaddf128_mask ((const __v2df *) __P,
+ (__v2df) __W,
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
+ return (__m128d) __builtin_ia32_expandloaddf128_mask ((const __v2df *) __P,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) {
+ return (__m256d) __builtin_ia32_expandloaddf256_mask ((const __v4df *) __P,
+ (__v4df) __W,
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
+ return (__m256d) __builtin_ia32_expandloaddf256_mask ((const __v4df *) __P,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) {
+ return (__m128i) __builtin_ia32_expandloaddi128_mask ((const __v2di *) __P,
+ (__v2di) __W,
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
+ return (__m128i) __builtin_ia32_expandloaddi128_mask ((const __v2di *) __P,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U,
+ void const *__P) {
+ return (__m256i) __builtin_ia32_expandloaddi256_mask ((const __v4di *) __P,
+ (__v4di) __W,
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
+ return (__m256i) __builtin_ia32_expandloaddi256_mask ((const __v4di *) __P,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) {
+ return (__m128) __builtin_ia32_expandloadsf128_mask ((const __v4sf *) __P,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
+ return (__m128) __builtin_ia32_expandloadsf128_mask ((const __v4sf *) __P,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) {
+ return (__m256) __builtin_ia32_expandloadsf256_mask ((const __v8sf *) __P,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
+ return (__m256) __builtin_ia32_expandloadsf256_mask ((const __v8sf *) __P,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) {
+ return (__m128i) __builtin_ia32_expandloadsi128_mask ((const __v4si *) __P,
+ (__v4si) __W,
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
+ return (__m128i) __builtin_ia32_expandloadsi128_mask ((const __v4si *) __P,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U,
+ void const *__P) {
+ return (__m256i) __builtin_ia32_expandloadsi256_mask ((const __v8si *) __P,
+ (__v8si) __W,
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
+ return (__m256i) __builtin_ia32_expandloadsi256_mask ((const __v8si *) __P,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+ return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) {
+ return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+ return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) {
+ return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
+ return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) {
+ return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
+ return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) {
+ return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
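+/* VGETEXPPD/VGETEXPPS: extract the exponent of each element as a
+ * floating-point value, roughly floor(log2(|x|)). */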
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_getexp_pd (__m128d __A) {
+ return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) {
+ return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) {
+ return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_getexp_pd (__m256d __A) {
+ return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) {
+ return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) {
+ return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_getexp_ps (__m128 __A) {
+ return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) {
+ return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) {
+ return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_getexp_ps (__m256 __A) {
+ return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) {
+ return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) {
+ return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_max_pd(__A, __B),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_max_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_max_pd(__A, __B),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_max_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_max_ps(__A, __B),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_max_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_max_ps(__A, __B),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_max_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_min_pd(__A, __B),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_min_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_min_pd(__A, __B),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_min_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_min_ps(__A, __B),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_min_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_min_ps(__A, __B),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_min_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_mul_pd(__A, __B),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_mul_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_mul_pd(__A, __B),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_mul_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_mul_ps(__A, __B),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_mul_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_mul_ps(__A, __B),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_mul_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_abs_epi32(__A),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_abs_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_abs_epi32(__A),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_abs_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
+}
+
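+/* The 64-bit abs/min/max operations below have no pre-AVX-512 intrinsic.
+ * On clang 14 and newer the generic __builtin_elementwise_* builtins are
+ * used; older releases fall back to the x86-specific __builtin_ia32_* forms,
+ * hence the version guards. */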
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_abs_epi64 (__m128i __A) {
+#if (__clang_major__ < 14)
+ return (__m128i)__builtin_ia32_pabsq128((__v2di)__A);
+#else
+ return (__m128i)__builtin_elementwise_abs((__v2di)__A);
+#endif
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_abs_epi64(__A),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_abs_epi64(__A),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_abs_epi64 (__m256i __A) {
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pabsq256 ((__v4di)__A);
+#else
+ return (__m256i)__builtin_elementwise_abs((__v4di)__A);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_abs_epi64(__A),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_abs_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_max_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_max_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_max_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_max_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_max_epi64 (__m128i __A, __m128i __B) {
+#if (__clang_major__ < 14)
+ return (__m128i)__builtin_ia32_pmaxsq128((__v2di)__A, (__v2di)__B);
+#else
+ return (__m128i)__builtin_elementwise_max((__v2di)__A, (__v2di)__B);
+#endif
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_max_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_max_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epi64 (__m256i __A, __m256i __B) {
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pmaxsq256((__v4di)__A, (__v4di)__B);
+#else
+ return (__m256i)__builtin_elementwise_max((__v4di)__A, (__v4di)__B);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_max_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_max_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_max_epu32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_max_epu32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_max_epu32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_max_epu32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_max_epu64 (__m128i __A, __m128i __B) {
+#if (__clang_major__ < 14)
+ return (__m128i)__builtin_ia32_pmaxuq128((__v2di)__A, (__v2di)__B);
+#else
+ return (__m128i)__builtin_elementwise_max((__v2du)__A, (__v2du)__B);
+#endif
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_max_epu64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_max_epu64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epu64 (__m256i __A, __m256i __B) {
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pmaxuq256((__v4di)__A, (__v4di)__B);
+#else
+ return (__m256i)__builtin_elementwise_max((__v4du)__A, (__v4du)__B);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_max_epu64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_max_epu64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_min_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_min_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_min_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_min_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_min_epi64 (__m128i __A, __m128i __B) {
+#if (__clang_major__ < 14)
+ return (__m128i)__builtin_ia32_pminsq128((__v2di)__A, (__v2di)__B);
+#else
+ return (__m128i)__builtin_elementwise_min((__v2di)__A, (__v2di)__B);
+#endif
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_min_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_min_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epi64 (__m256i __A, __m256i __B) {
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pminsq256((__v4di)__A, (__v4di)__B);
+#else
+ return (__m256i)__builtin_elementwise_min((__v4di)__A, (__v4di)__B);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_min_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_min_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_min_epu32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm_min_epu32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_min_epu32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_min_epu32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_min_epu64 (__m128i __A, __m128i __B) {
+#if (__clang_major__ < 14)
+ return (__m128i)__builtin_ia32_pminuq128((__v2di)__A, (__v2di)__B);
+#else
+ return (__m128i)__builtin_elementwise_min((__v2du)__A, (__v2du)__B);
+#endif
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_min_epu64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
+ (__v2di)_mm_min_epu64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epu64 (__m256i __A, __m256i __B) {
+#if (__clang_major__ < 14)
+ return (__m256i)__builtin_ia32_pminuq256((__v4di)__A, (__v4di)__B);
+#else
+ return (__m256i)__builtin_elementwise_min((__v4du)__A, (__v4du)__B);
+#endif
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_min_epu64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_min_epu64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
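+/* VRNDSCALEPD/VRNDSCALEPS: round each element to the number of fraction bits
+ * encoded in the upper four bits of the immediate, using the rounding control
+ * in its low bits. These are macros because imm must be a compile-time
+ * constant. */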
+#define _mm_roundscale_pd(A, imm) \
+ ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
+ (int)(imm), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1))
+
+
+#define _mm_mask_roundscale_pd(W, U, A, imm) \
+ ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
+ (int)(imm), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U)))
+
+
+#define _mm_maskz_roundscale_pd(U, A, imm) \
+ ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
+ (int)(imm), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U)))
+
+
+#define _mm256_roundscale_pd(A, imm) \
+ ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
+ (int)(imm), \
+ (__v4df)_mm256_setzero_pd(), \
+ (__mmask8)-1))
+
+
+#define _mm256_mask_roundscale_pd(W, U, A, imm) \
+ ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
+ (int)(imm), \
+ (__v4df)(__m256d)(W), \
+ (__mmask8)(U)))
+
+
+#define _mm256_maskz_roundscale_pd(U, A, imm) \
+ ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
+ (int)(imm), \
+ (__v4df)_mm256_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm_roundscale_ps(A, imm) \
+ ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1))
+
+
+#define _mm_mask_roundscale_ps(W, U, A, imm) \
+ ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U)))
+
+
+#define _mm_maskz_roundscale_ps(U, A, imm) \
+ ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U)))
+
+#define _mm256_roundscale_ps(A, imm) \
+ ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)-1))
+
+#define _mm256_mask_roundscale_ps(W, U, A, imm) \
+ ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
+ (__v8sf)(__m256)(W), \
+ (__mmask8)(U)))
+
+
+#define _mm256_maskz_roundscale_ps(U, A, imm) \
+ ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)(U)))
+
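+/* VSCALEFPD/VSCALEFPS: compute __A * 2^floor(__B) per element. */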
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_scalef_pd (__m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B) {
+ return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_scalef_pd (__m256d __A, __m256d __B) {
+ return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B) {
+ return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_scalef_ps (__m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_scalef_ps (__m256 __A, __m256 __B) {
+ return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A,
+ __m256 __B) {
+ return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
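+/* Scatter stores: for each lane whose mask bit is set, write the element of
+ * v1 to (char *)addr + index[i] * scale. The scale must be a literal 1, 2, 4
+ * or 8, so these are provided as macros rather than functions. */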
+#define _mm_i64scatter_pd(addr, index, v1, scale) \
+ __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)-1, \
+ (__v2di)(__m128i)(index), \
+ (__v2df)(__m128d)(v1), (int)(scale))
+
+#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) \
+ __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)(mask), \
+ (__v2di)(__m128i)(index), \
+ (__v2df)(__m128d)(v1), (int)(scale))
+
+#define _mm_i64scatter_epi64(addr, index, v1, scale) \
+ __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)-1, \
+ (__v2di)(__m128i)(index), \
+ (__v2di)(__m128i)(v1), (int)(scale))
+
+#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
+ __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)(mask), \
+ (__v2di)(__m128i)(index), \
+ (__v2di)(__m128i)(v1), (int)(scale))
+
+#define _mm256_i64scatter_pd(addr, index, v1, scale) \
+ __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)-1, \
+ (__v4di)(__m256i)(index), \
+ (__v4df)(__m256d)(v1), (int)(scale))
+
+#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) \
+ __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)(mask), \
+ (__v4di)(__m256i)(index), \
+ (__v4df)(__m256d)(v1), (int)(scale))
+
+#define _mm256_i64scatter_epi64(addr, index, v1, scale) \
+ __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)-1, \
+ (__v4di)(__m256i)(index), \
+ (__v4di)(__m256i)(v1), (int)(scale))
+
+#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
+ __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)(mask), \
+ (__v4di)(__m256i)(index), \
+ (__v4di)(__m256i)(v1), (int)(scale))
+
+#define _mm_i64scatter_ps(addr, index, v1, scale) \
+ __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)-1, \
+ (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
+ (int)(scale))
+
+#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) \
+ __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)(mask), \
+ (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
+ (int)(scale))
+
+#define _mm_i64scatter_epi32(addr, index, v1, scale) \
+ __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)-1, \
+ (__v2di)(__m128i)(index), \
+ (__v4si)(__m128i)(v1), (int)(scale))
+
+#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
+ __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)(mask), \
+ (__v2di)(__m128i)(index), \
+ (__v4si)(__m128i)(v1), (int)(scale))
+
+#define _mm256_i64scatter_ps(addr, index, v1, scale) \
+ __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)-1, \
+ (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
+ (int)(scale))
+
+#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) \
+ __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)(mask), \
+ (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
+ (int)(scale))
+
+#define _mm256_i64scatter_epi32(addr, index, v1, scale) \
+ __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)-1, \
+ (__v4di)(__m256i)(index), \
+ (__v4si)(__m128i)(v1), (int)(scale))
+
+#define _mm256_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
+ __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)(mask), \
+ (__v4di)(__m256i)(index), \
+ (__v4si)(__m128i)(v1), (int)(scale))
+
+#define _mm_i32scatter_pd(addr, index, v1, scale) \
+ __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)-1, \
+ (__v4si)(__m128i)(index), \
+ (__v2df)(__m128d)(v1), (int)(scale))
+
+#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) \
+ __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)(mask), \
+ (__v4si)(__m128i)(index), \
+ (__v2df)(__m128d)(v1), (int)(scale))
+
+#define _mm_i32scatter_epi64(addr, index, v1, scale) \
+ __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)-1, \
+ (__v4si)(__m128i)(index), \
+ (__v2di)(__m128i)(v1), (int)(scale))
+
+#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
+ __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)(mask), \
+ (__v4si)(__m128i)(index), \
+ (__v2di)(__m128i)(v1), (int)(scale))
+
+#define _mm256_i32scatter_pd(addr, index, v1, scale) \
+ __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)-1, \
+ (__v4si)(__m128i)(index), \
+ (__v4df)(__m256d)(v1), (int)(scale))
+
+#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) \
+ __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)(mask), \
+ (__v4si)(__m128i)(index), \
+ (__v4df)(__m256d)(v1), (int)(scale))
+
+#define _mm256_i32scatter_epi64(addr, index, v1, scale) \
+ __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)-1, \
+ (__v4si)(__m128i)(index), \
+ (__v4di)(__m256i)(v1), (int)(scale))
+
+#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
+ __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)(mask), \
+ (__v4si)(__m128i)(index), \
+ (__v4di)(__m256i)(v1), (int)(scale))
+
+#define _mm_i32scatter_ps(addr, index, v1, scale) \
+ __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)-1, \
+ (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
+ (int)(scale))
+
+#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) \
+ __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)(mask), \
+ (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
+ (int)(scale))
+
+#define _mm_i32scatter_epi32(addr, index, v1, scale) \
+ __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)-1, \
+ (__v4si)(__m128i)(index), \
+ (__v4si)(__m128i)(v1), (int)(scale))
+
+#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
+ __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)(mask), \
+ (__v4si)(__m128i)(index), \
+ (__v4si)(__m128i)(v1), (int)(scale))
+
+#define _mm256_i32scatter_ps(addr, index, v1, scale) \
+ __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)-1, \
+ (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
+ (int)(scale))
+
+#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) \
+ __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)(mask), \
+ (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
+ (int)(scale))
+
+#define _mm256_i32scatter_epi32(addr, index, v1, scale) \
+ __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)-1, \
+ (__v8si)(__m256i)(index), \
+ (__v8si)(__m256i)(v1), (int)(scale))
+
+#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
+ __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)(mask), \
+ (__v8si)(__m256i)(index), \
+ (__v8si)(__m256i)(v1), (int)(scale))
+
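+/* Usage sketch (illustrative only; variable names are hypothetical): the
+ * scatter macros above store each source element to addr + index[i]*scale,
+ * and the masked forms write only the lanes whose mask bit is set.
+ *
+ *   float table[64];
+ *   __m128i idx = _mm_setr_epi32(0, 5, 9, 12);
+ *   __m128  val = _mm_set1_ps(1.0f);
+ *   // writes val[i] to table[idx[i]] for lanes 0 and 1 only (mask 0x3)
+ *   _mm_mask_i32scatter_ps(table, 0x3, idx, val, sizeof(float));
+ */
+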
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128
+ _mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_sqrt_pd(__A),
+ (__v2df)__W);
+ }
+
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128
+ _mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_sqrt_pd(__A),
+ (__v2df)_mm_setzero_pd());
+ }
+
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ _mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_sqrt_pd(__A),
+ (__v4df)__W);
+ }
+
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ _mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_sqrt_pd(__A),
+ (__v4df)_mm256_setzero_pd());
+ }
+
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ _mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_sqrt_ps(__A),
+ (__v4sf)__W);
+ }
+
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ _mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_sqrt_ps(__A),
+ (__v4sf)_mm_setzero_ps());
+ }
+
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ _mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_sqrt_ps(__A),
+ (__v8sf)__W);
+ }
+
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ _mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_sqrt_ps(__A),
+ (__v8sf)_mm256_setzero_ps());
+ }
+
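+/* Usage sketch (illustrative only; values are hypothetical): the _mm*_mask_*
+ * forms merge, keeping lanes from the __W operand where the mask bit is
+ * clear, while the _mm*_maskz_* forms zero those lanes instead.
+ *
+ *   __m128d a  = _mm_setr_pd(4.0, 9.0);
+ *   __m128d w  = _mm_setr_pd(-1.0, -1.0);
+ *   __m128d r0 = _mm_mask_sqrt_pd(w, 0x1, a);   // {2.0, -1.0}
+ *   __m128d r1 = _mm_maskz_sqrt_pd(0x1, a);     // {2.0,  0.0}
+ */
+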
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128
+ _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_sub_pd(__A, __B),
+ (__v2df)__W);
+ }
+
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128
+ _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_sub_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
+ }
+
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_sub_pd(__A, __B),
+ (__v4df)__W);
+ }
+
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_sub_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
+ }
+
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_sub_ps(__A, __B),
+ (__v4sf)__W);
+ }
+
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_sub_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
+ }
+
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_sub_ps(__A, __B),
+ (__v8sf)__W);
+ }
+
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_sub_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) {
+ return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I,
+ (__v4si)__B);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_permutex2var_epi32(__A, __I, __B),
+ (__v4si)__A);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_permutex2var_epi32(__A, __I, __B),
+ (__v4si)__I);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_permutex2var_epi32(__A, __I, __B),
+ (__v4si)_mm_setzero_si128());
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I,
+ (__v8si) __B);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I,
+ __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_permutex2var_epi32(__A, __I, __B),
+ (__v8si)__A);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U,
+ __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_permutex2var_epi32(__A, __I, __B),
+ (__v8si)__I);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I,
+ __m256i __B) {
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_permutex2var_epi32(__A, __I, __B),
+ (__v8si)_mm256_setzero_si256());
+ }
+
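+/* Usage sketch (illustrative only; values are hypothetical): permutex2var
+ * selects each result element from the concatenation of the two data
+ * operands.  In the 128-bit epi32 form each index uses its low 3 bits:
+ * bits [1:0] pick the lane and bit 2 picks __A (0) or __B (1).
+ *
+ *   __m128i a   = _mm_setr_epi32(10, 11, 12, 13);
+ *   __m128i b   = _mm_setr_epi32(20, 21, 22, 23);
+ *   __m128i idx = _mm_setr_epi32(0, 4, 3, 7);
+ *   __m128i r   = _mm_permutex2var_epi32(a, idx, b);  // {10, 20, 13, 23}
+ */
+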
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128
+ _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
+ return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I,
+ (__v2df)__B);
+ }
+
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128
+ _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128(__U,
+ (__v2df)_mm_permutex2var_pd(__A, __I, __B),
+ (__v2df)__A);
+ }
+
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128
+ _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128(__U,
+ (__v2df)_mm_permutex2var_pd(__A, __I, __B),
+ (__v2df)(__m128d)__I);
+ }
+
+ static __inline__ __m128d __DEFAULT_FN_ATTRS128
+ _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
+ return (__m128d)__builtin_ia32_selectpd_128(__U,
+ (__v2df)_mm_permutex2var_pd(__A, __I, __B),
+ (__v2df)_mm_setzero_pd());
+ }
+
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) {
+ return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I,
+ (__v4df)__B);
+ }
+
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I,
+ __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256(__U,
+ (__v4df)_mm256_permutex2var_pd(__A, __I, __B),
+ (__v4df)__A);
+ }
+
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U,
+ __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256(__U,
+ (__v4df)_mm256_permutex2var_pd(__A, __I, __B),
+ (__v4df)(__m256d)__I);
+ }
+
+ static __inline__ __m256d __DEFAULT_FN_ATTRS256
+ _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I,
+ __m256d __B) {
+ return (__m256d)__builtin_ia32_selectpd_256(__U,
+ (__v4df)_mm256_permutex2var_pd(__A, __I, __B),
+ (__v4df)_mm256_setzero_pd());
+ }
+
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
+ return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I,
+ (__v4sf)__B);
+ }
+
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128(__U,
+ (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
+ (__v4sf)__A);
+ }
+
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128(__U,
+ (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
+ (__v4sf)(__m128)__I);
+ }
+
+ static __inline__ __m128 __DEFAULT_FN_ATTRS128
+ _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
+ return (__m128)__builtin_ia32_selectps_128(__U,
+ (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
+ (__v4sf)_mm_setzero_ps());
+ }
+
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) {
+ return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I,
+ (__v8sf) __B);
+ }
+
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256(__U,
+ (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
+ (__v8sf)__A);
+ }
+
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U,
+ __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256(__U,
+ (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
+ (__v8sf)(__m256)__I);
+ }
+
+ static __inline__ __m256 __DEFAULT_FN_ATTRS256
+ _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I,
+ __m256 __B) {
+ return (__m256)__builtin_ia32_selectps_256(__U,
+ (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
+ (__v8sf)_mm256_setzero_ps());
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) {
+ return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I,
+ (__v2di)__B);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128(__U,
+ (__v2di)_mm_permutex2var_epi64(__A, __I, __B),
+ (__v2di)__A);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128(__U,
+ (__v2di)_mm_permutex2var_epi64(__A, __I, __B),
+ (__v2di)__I);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_selectq_128(__U,
+ (__v2di)_mm_permutex2var_epi64(__A, __I, __B),
+ (__v2di)_mm_setzero_si128());
+ }
+
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I,
+ (__v4di) __B);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I,
+ __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256(__U,
+ (__v4di)_mm256_permutex2var_epi64(__A, __I, __B),
+ (__v4di)__A);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U,
+ __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256(__U,
+ (__v4di)_mm256_permutex2var_epi64(__A, __I, __B),
+ (__v4di)__I);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I,
+ __m256i __B) {
+ return (__m256i)__builtin_ia32_selectq_256(__U,
+ (__v4di)_mm256_permutex2var_epi64(__A, __I, __B),
+ (__v4di)_mm256_setzero_si256());
+ }
+
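+/* Note on the masked permutex2var flavours above: they differ only in what
+ * is kept for lanes whose mask bit is clear.  _mm*_mask_* keeps the first
+ * data operand (__A), _mm*_mask2_* keeps the index operand (__I), and
+ * _mm*_maskz_* zeroes the lane.
+ */
+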
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepi8_epi32(__A),
+ (__v4si)__W);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepi8_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepi8_epi32(__A),
+ (__v8si)__W);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepi8_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi8_epi64(__A),
+ (__v2di)__W);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi8_epi64(__A),
+ (__v2di)_mm_setzero_si128());
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi8_epi64(__A),
+ (__v4di)__W);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi8_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
+ {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi32_epi64(__X),
+ (__v2di)__W);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
+ {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi32_epi64(__X),
+ (__v2di)_mm_setzero_si128());
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
+ {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi32_epi64(__X),
+ (__v4di)__W);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
+ {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi32_epi64(__X),
+ (__v4di)_mm256_setzero_si256());
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepi16_epi32(__A),
+ (__v4si)__W);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepi16_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepi16_epi32(__A),
+ (__v8si)__W);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepi16_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi16_epi64(__A),
+ (__v2di)__W);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepi16_epi64(__A),
+ (__v2di)_mm_setzero_si128());
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi16_epi64(__A),
+ (__v4di)__W);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepi16_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
+ }
+
+
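+/* Usage sketch (illustrative only; values are hypothetical): the masked
+ * cvtepi* wrappers above apply the usual sign-extending widenings (e.g.
+ * epi8 -> epi32) and then blend under the mask.
+ *
+ *   __m128i bytes = _mm_setr_epi8(-1, 2, -3, 4, 0, 0, 0, 0,
+ *                                  0, 0, 0, 0, 0, 0, 0, 0);
+ *   // lanes 0..2 get the sign-extended bytes, lane 3 is zeroed (mask 0x7)
+ *   __m128i r = _mm_maskz_cvtepi8_epi32(0x7, bytes);   // {-1, 2, -3, 0}
+ */
+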
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepu8_epi32(__A),
+ (__v4si)__W);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepu8_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepu8_epi32(__A),
+ (__v8si)__W);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepu8_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu8_epi64(__A),
+ (__v2di)__W);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu8_epi64(__A),
+ (__v2di)_mm_setzero_si128());
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu8_epi64(__A),
+ (__v4di)__W);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu8_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
+ {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu32_epi64(__X),
+ (__v2di)__W);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
+ {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu32_epi64(__X),
+ (__v2di)_mm_setzero_si128());
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
+ {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu32_epi64(__X),
+ (__v4di)__W);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
+ {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu32_epi64(__X),
+ (__v4di)_mm256_setzero_si256());
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepu16_epi32(__A),
+ (__v4si)__W);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_cvtepu16_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepu16_epi32(__A),
+ (__v8si)__W);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_cvtepu16_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu16_epi64(__A),
+ (__v2di)__W);
+ }
+
+ static __inline__ __m128i __DEFAULT_FN_ATTRS128
+ _mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
+ {
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_cvtepu16_epi64(__A),
+ (__v2di)_mm_setzero_si128());
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu16_epi64(__A),
+ (__v4di)__W);
+ }
+
+ static __inline__ __m256i __DEFAULT_FN_ATTRS256
+ _mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
+ {
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_cvtepu16_epi64(__A),
+ (__v4di)_mm256_setzero_si256());
+ }
+
+
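+/* Usage sketch (illustrative only; values are hypothetical): the masked
+ * cvtepu* wrappers above are the zero-extending counterparts of the
+ * cvtepi* forms, so a 0xFF byte widens to 255 rather than -1.
+ *
+ *   __m128i bytes = _mm_set1_epi8((char)0xFF);
+ *   __m128i s = _mm_cvtepi8_epi32(bytes);            // {-1, -1, -1, -1}
+ *   __m128i u = _mm_maskz_cvtepu8_epi32(0xF, bytes); // {255, 255, 255, 255}
+ */
+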
+#define _mm_rol_epi32(a, b) \
+ ((__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(b)))
+
+#define _mm_mask_rol_epi32(w, u, a, b) \
+ ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
+ (__v4si)_mm_rol_epi32((a), (b)), \
+ (__v4si)(__m128i)(w)))
+
+#define _mm_maskz_rol_epi32(u, a, b) \
+ ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
+ (__v4si)_mm_rol_epi32((a), (b)), \
+ (__v4si)_mm_setzero_si128()))
+
+#define _mm256_rol_epi32(a, b) \
+ ((__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b)))
+
+#define _mm256_mask_rol_epi32(w, u, a, b) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
+ (__v8si)_mm256_rol_epi32((a), (b)), \
+ (__v8si)(__m256i)(w)))
+
+#define _mm256_maskz_rol_epi32(u, a, b) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
+ (__v8si)_mm256_rol_epi32((a), (b)), \
+ (__v8si)_mm256_setzero_si256()))
+
+#define _mm_rol_epi64(a, b) \
+ ((__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b)))
+
+#define _mm_mask_rol_epi64(w, u, a, b) \
+ ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
+ (__v2di)_mm_rol_epi64((a), (b)), \
+ (__v2di)(__m128i)(w)))
+
+#define _mm_maskz_rol_epi64(u, a, b) \
+ ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
+ (__v2di)_mm_rol_epi64((a), (b)), \
+ (__v2di)_mm_setzero_si128()))
+
+#define _mm256_rol_epi64(a, b) \
+ ((__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b)))
+
+#define _mm256_mask_rol_epi64(w, u, a, b) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
+ (__v4di)_mm256_rol_epi64((a), (b)), \
+ (__v4di)(__m256i)(w)))
+
+#define _mm256_maskz_rol_epi64(u, a, b) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
+ (__v4di)_mm256_rol_epi64((a), (b)), \
+ (__v4di)_mm256_setzero_si256()))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_rolv_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_prolvd128((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_rolv_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_rolv_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_rolv_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_prolvd256((__v8si)__A, (__v8si)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_rolv_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_rolv_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_rolv_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_prolvq128((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__U,
+ (__v2di)_mm_rolv_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__U,
+ (__v2di)_mm_rolv_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_rolv_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_prolvq256((__v4di)__A, (__v4di)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__U,
+ (__v4di)_mm256_rolv_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__U,
+ (__v4di)_mm256_rolv_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
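+/* Usage sketch (illustrative only; values are hypothetical): _mm*_rol_epi32/64
+ * rotate every element left by an immediate count, while _mm*_rolv_* take a
+ * per-element count vector.
+ *
+ *   __m128i a = _mm_set1_epi32((int)0x80000001);
+ *   __m128i r = _mm_rol_epi32(a, 1);          // every lane becomes 0x00000003
+ *   __m128i c = _mm_setr_epi32(0, 1, 4, 8);
+ *   __m128i v = _mm_rolv_epi32(a, c);         // each lane uses its own count
+ */
+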
+#define _mm_ror_epi32(a, b) \
+ ((__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b)))
+
+#define _mm_mask_ror_epi32(w, u, a, b) \
+ ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
+ (__v4si)_mm_ror_epi32((a), (b)), \
+ (__v4si)(__m128i)(w)))
+
+#define _mm_maskz_ror_epi32(u, a, b) \
+ ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
+ (__v4si)_mm_ror_epi32((a), (b)), \
+ (__v4si)_mm_setzero_si128()))
+
+#define _mm256_ror_epi32(a, b) \
+ ((__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b)))
+
+#define _mm256_mask_ror_epi32(w, u, a, b) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
+ (__v8si)_mm256_ror_epi32((a), (b)), \
+ (__v8si)(__m256i)(w)))
+
+#define _mm256_maskz_ror_epi32(u, a, b) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
+ (__v8si)_mm256_ror_epi32((a), (b)), \
+ (__v8si)_mm256_setzero_si256()))
+
+#define _mm_ror_epi64(a, b) \
+ ((__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b)))
+
+#define _mm_mask_ror_epi64(w, u, a, b) \
+ ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
+ (__v2di)_mm_ror_epi64((a), (b)), \
+ (__v2di)(__m128i)(w)))
+
+#define _mm_maskz_ror_epi64(u, a, b) \
+ ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
+ (__v2di)_mm_ror_epi64((a), (b)), \
+ (__v2di)_mm_setzero_si128()))
+
+#define _mm256_ror_epi64(a, b) \
+ ((__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b)))
+
+#define _mm256_mask_ror_epi64(w, u, a, b) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
+ (__v4di)_mm256_ror_epi64((a), (b)), \
+ (__v4di)(__m256i)(w)))
+
+#define _mm256_maskz_ror_epi64(u, a, b) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
+ (__v4di)_mm256_ror_epi64((a), (b)), \
+ (__v4di)_mm256_setzero_si256()))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sll_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sll_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sll_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sll_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_slli_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_slli_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_slli_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_slli_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sll_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sll_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sll_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sll_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_slli_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_slli_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_slli_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_slli_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
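+/* Usage sketch (illustrative only; values are hypothetical): the masked
+ * sll_* forms shift by the count held in the low 64 bits of a vector, the
+ * slli_* forms take an immediate count, and both merge or zero under the
+ * mask like the other masked operations in this header.
+ *
+ *   __m128i a = _mm_set1_epi32(1);
+ *   __m128i w = _mm_set1_epi32(-1);
+ *   __m128i r = _mm_mask_slli_epi32(w, 0x5, a, 3);  // {8, -1, 8, -1}
+ */
+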
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_rorv_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_prorvd128((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_rorv_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_rorv_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_rorv_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_prorvd256((__v8si)__A, (__v8si)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_rorv_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_rorv_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_rorv_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_prorvq128((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__U,
+ (__v2di)_mm_rorv_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__U,
+ (__v2di)_mm_rorv_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_rorv_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_prorvq256((__v4di)__A, (__v4di)__B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__U,
+ (__v4di)_mm256_rorv_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__U,
+ (__v4di)_mm256_rorv_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sllv_epi64(__X, __Y),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sllv_epi64(__X, __Y),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sllv_epi64(__X, __Y),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sllv_epi64(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sllv_epi32(__X, __Y),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sllv_epi32(__X, __Y),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sllv_epi32(__X, __Y),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sllv_epi32(__X, __Y),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srlv_epi64(__X, __Y),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srlv_epi64(__X, __Y),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srlv_epi64(__X, __Y),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srlv_epi64(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srlv_epi32(__X, __Y),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srlv_epi32(__X, __Y),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srlv_epi32(__X, __Y),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srlv_epi32(__X, __Y),
+ (__v8si)_mm256_setzero_si256());
+}
+
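+/* Usage sketch (illustrative only; values are hypothetical): the sllv_* and
+ * srlv_* wrappers above shift each element by its own count taken from the
+ * second vector, with the usual merge/zero masking on top.
+ *
+ *   __m128i x   = _mm_set1_epi32(16);
+ *   __m128i cnt = _mm_setr_epi32(0, 1, 2, 3);
+ *   __m128i r   = _mm_maskz_srlv_epi32(0xF, x, cnt);  // {16, 8, 4, 2}
+ */
+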
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srl_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srl_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srl_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srl_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srli_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srli_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srli_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srli_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srl_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srl_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srl_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srl_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srli_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srli_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srli_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srli_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srav_epi32(__X, __Y),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srav_epi32(__X, __Y),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srav_epi32(__X, __Y),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srav_epi32(__X, __Y),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_srav_epi64(__m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srav_epi64(__X, __Y),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srav_epi64(__X, __Y),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srav_epi64(__m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di) __Y);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srav_epi64(__X, __Y),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srav_epi64(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
+}
+
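+/* Usage sketch (illustrative only; values are hypothetical): unlike AVX2,
+ * AVX-512VL provides arithmetic (sign-preserving) variable right shifts for
+ * 64-bit elements, exposed here as _mm_srav_epi64/_mm256_srav_epi64.
+ *
+ *   __m128i x   = _mm_set1_epi64x(-8);
+ *   __m128i cnt = _mm_set_epi64x(2, 1);        // counts are {1, 2}
+ *   __m128i r   = _mm_srav_epi64(x, cnt);      // {-4, -2}
+ */
+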
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
+ (__v4si) __A,
+ (__v4si) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
+ (__v4si) __A,
+ (__v4si) _mm_setzero_si128 ());
+}
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
+ (__v8si) __A,
+ (__v8si) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
+ (__v8si) __A,
+ (__v8si) _mm256_setzero_si256 ());
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS128
+_mm_load_epi32 (void const *__P)
+{
+ return *(const __m128i *) __P;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_movdqa32load128_mask ((const __v4si *) __P,
+ (__v4si) __W,
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_load_epi32 (__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_movdqa32load128_mask ((const __v4si *) __P,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8)
+ __U);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS256
+_mm256_load_epi32 (void const *__P)
+{
+ return *(const __m256i *) __P;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_movdqa32load256_mask ((const __v8si *) __P,
+ (__v8si) __W,
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_movdqa32load256_mask ((const __v8si *) __P,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8)
+ __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS128
+_mm_store_epi32 (void *__P, __m128i __A)
+{
+ *(__m128i *) __P = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+ __builtin_ia32_movdqa32store128_mask ((__v4si *) __P,
+ (__v4si) __A,
+ (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS256
+_mm256_store_epi32 (void *__P, __m256i __A)
+{
+ *(__m256i *) __P = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+ __builtin_ia32_movdqa32store256_mask ((__v8si *) __P,
+ (__v8si) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
+ (__v2di) __A,
+ (__v2di) __W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
+ (__v2di) __A,
+ (__v2di) _mm_setzero_si128 ());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
+ (__v4di) __A,
+ (__v4di) __W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
+ (__v4di) __A,
+ (__v4di) _mm256_setzero_si256 ());
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS128
+_mm_load_epi64 (void const *__P)
+{
+ return *(const __m128i *) __P;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_movdqa64load128_mask ((const __v2di *) __P,
+ (__v2di) __W,
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_movdqa64load128_mask ((const __v2di *) __P,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8)
+ __U);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS256
+_mm256_load_epi64 (void const *__P)
+{
+ return *(const __m256i *) __P;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_movdqa64load256_mask ((const __v4di *) __P,
+ (__v4di) __W,
+ (__mmask8)
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_movdqa64load256_mask ((const __v4di *) __P,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8)
+ __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS128
+_mm_store_epi64 (void *__P, __m128i __A)
+{
+ *(__m128i *) __P = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+ __builtin_ia32_movdqa64store128_mask ((__v2di *) __P,
+ (__v2di) __A,
+ (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS256
+_mm256_store_epi64 (void *__P, __m256i __A)
+{
+ *(__m256i *) __P = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+ __builtin_ia32_movdqa64store256_mask ((__v4di *) __P,
+ (__v4di) __A,
+ (__mmask8) __U);
+}
+
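+/* Usage sketch (illustrative only; buffer is hypothetical): the masked
+ * mov_epi* functions are plain mask-driven blends, and the masked
+ * load/store_epi* wrappers above are their memory forms; the non-"u"
+ * variants require the pointer to be aligned to the full vector width
+ * (16 or 32 bytes).
+ *
+ *   int buf[4] __attribute__((aligned(16))) = {1, 2, 3, 4};
+ *   // loads lanes 0 and 1 from buf, zeroes lanes 2 and 3
+ *   __m128i r = _mm_maskz_load_epi32(0x3, buf);        // {1, 2, 0, 0}
+ */
+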
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_movedup_pd(__A),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A)
+{
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_movedup_pd(__A),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_movedup_pd(__A),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
+{
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_movedup_pd(__A),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__M,
+ (__v4si) _mm_set1_epi32(__A),
+ (__v4si)__O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_set1_epi32( __mmask8 __M, int __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__M,
+ (__v4si) _mm_set1_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__M,
+ (__v8si) _mm256_set1_epi32(__A),
+ (__v8si)__O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_set1_epi32( __mmask8 __M, int __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__M,
+ (__v8si) _mm256_set1_epi32(__A),
+ (__v8si)_mm256_setzero_si256());
+}
+
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
+{
+ return (__m128i) __builtin_ia32_selectq_128(__M,
+ (__v2di) _mm_set1_epi64x(__A),
+ (__v2di) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+ return (__m128i) __builtin_ia32_selectq_128(__M,
+ (__v2di) _mm_set1_epi64x(__A),
+ (__v2di) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
+{
+ return (__m256i) __builtin_ia32_selectq_256(__M,
+ (__v4di) _mm256_set1_epi64x(__A),
+ (__v4di) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+ return (__m256i) __builtin_ia32_selectq_256(__M,
+ (__v4di) _mm256_set1_epi64x(__A),
+ (__v4di) _mm256_setzero_si256());
+}
+
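+/* Usage sketch (illustrative only; values are hypothetical): the masked
+ * set1_epi32/64 functions broadcast a scalar into the lanes selected by the
+ * mask and take the remaining lanes from __O (or zero them in the maskz
+ * form).
+ *
+ *   __m128i old = _mm_setr_epi32(9, 9, 9, 9);
+ *   __m128i r   = _mm_mask_set1_epi32(old, 0x6, 7);    // {9, 7, 7, 9}
+ */
+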
+#define _mm_fixupimm_pd(A, B, C, imm) \
+ ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2di)(__m128i)(C), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm_mask_fixupimm_pd(A, U, B, C, imm) \
+ ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2di)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_fixupimm_pd(U, A, B, C, imm) \
+ ((__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), \
+ (__v2di)(__m128i)(C), \
+ (int)(imm), (__mmask8)(U)))
+
+#define _mm256_fixupimm_pd(A, B, C, imm) \
+ ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), \
+ (__v4di)(__m256i)(C), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm256_mask_fixupimm_pd(A, U, B, C, imm) \
+ ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), \
+ (__v4di)(__m256i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) \
+ ((__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), \
+ (__v4di)(__m256i)(C), \
+ (int)(imm), (__mmask8)(U)))
+
+#define _mm_fixupimm_ps(A, B, C, imm) \
+ ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4si)(__m128i)(C), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm_mask_fixupimm_ps(A, U, B, C, imm) \
+ ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4si)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) \
+ ((__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), \
+ (__v4si)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm256_fixupimm_ps(A, B, C, imm) \
+ ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), \
+ (__v8si)(__m256i)(C), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm256_mask_fixupimm_ps(A, U, B, C, imm) \
+ ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), \
+ (__v8si)(__m256i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) \
+ ((__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), \
+ (__v8si)(__m256i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
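+/* Masked aligned loads: elements whose mask bit is clear are taken from __W
+ * (mask_ forms) or zeroed (maskz_ forms). __P must be 16-byte aligned for the
+ * 128-bit forms and 32-byte aligned for the 256-bit forms. */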
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P)
+{
+ return (__m128d) __builtin_ia32_loadapd128_mask ((const __v2df *) __P,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+ return (__m128d) __builtin_ia32_loadapd128_mask ((const __v2df *) __P,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P)
+{
+ return (__m256d) __builtin_ia32_loadapd256_mask ((const __v4df *) __P,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+ return (__m256d) __builtin_ia32_loadapd256_mask ((const __v4df *) __P,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P)
+{
+ return (__m128) __builtin_ia32_loadaps128_mask ((const __v4sf *) __P,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_load_ps (__mmask8 __U, void const *__P)
+{
+ return (__m128) __builtin_ia32_loadaps128_mask ((const __v4sf *) __P,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P)
+{
+ return (__m256) __builtin_ia32_loadaps256_mask ((const __v8sf *) __P,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_load_ps (__mmask8 __U, void const *__P)
+{
+ return (__m256) __builtin_ia32_loadaps256_mask ((const __v8sf *) __P,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
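+/* Unaligned integer loads: the access goes through a packed, may_alias struct
+ * so the compiler emits an unaligned load without violating strict aliasing. */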
+static __inline __m128i __DEFAULT_FN_ATTRS128
+_mm_loadu_epi64 (void const *__P)
+{
+ struct __loadu_epi64 {
+ __m128i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_epi64*)__P)->__v;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddqudi128_mask ((const __v2di *) __P,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddqudi128_mask ((const __v2di *) __P,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS256
+_mm256_loadu_epi64 (void const *__P)
+{
+ struct __loadu_epi64 {
+ __m256i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_epi64*)__P)->__v;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddqudi256_mask ((const __v4di *) __P,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddqudi256_mask ((const __v4di *) __P,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline __m128i __DEFAULT_FN_ATTRS128
+_mm_loadu_epi32 (void const *__P)
+{
+ struct __loadu_epi32 {
+ __m128i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_epi32*)__P)->__v;
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddqusi128_mask ((const __v4si *) __P,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddqusi128_mask ((const __v4si *) __P,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+static __inline __m256i __DEFAULT_FN_ATTRS256
+_mm256_loadu_epi32 (void const *__P)
+{
+ struct __loadu_epi32 {
+ __m256i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_epi32*)__P)->__v;
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddqusi256_mask ((const __v8si *) __P,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddqusi256_mask ((const __v8si *) __P,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P)
+{
+ return (__m128d) __builtin_ia32_loadupd128_mask ((const __v2df *) __P,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+ return (__m128d) __builtin_ia32_loadupd128_mask ((const __v2df *) __P,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P)
+{
+ return (__m256d) __builtin_ia32_loadupd256_mask ((const __v4df *) __P,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+ return (__m256d) __builtin_ia32_loadupd256_mask ((const __v4df *) __P,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P)
+{
+ return (__m128) __builtin_ia32_loadups128_mask ((const __v4sf *) __P,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_loadu_ps (__mmask8 __U, void const *__P)
+{
+ return (__m128) __builtin_ia32_loadups128_mask ((const __v4sf *) __P,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P)
+{
+ return (__m256) __builtin_ia32_loadups256_mask ((const __v8sf *) __P,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P)
+{
+ return (__m256) __builtin_ia32_loadups256_mask ((const __v8sf *) __P,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
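+/* Masked stores: only elements whose mask bit is set are written to memory.
+ * The store_ forms require an aligned pointer; the storeu_ forms do not. */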
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+ __builtin_ia32_storeapd128_mask ((__v2df *) __P,
+ (__v2df) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+ __builtin_ia32_storeapd256_mask ((__v4df *) __P,
+ (__v4df) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+ __builtin_ia32_storeaps128_mask ((__v4sf *) __P,
+ (__v4sf) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+ __builtin_ia32_storeaps256_mask ((__v8sf *) __P,
+ (__v8sf) __A,
+ (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS128
+_mm_storeu_epi64 (void *__P, __m128i __A)
+{
+ struct __storeu_epi64 {
+ __m128i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_epi64*)__P)->__v = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+ __builtin_ia32_storedqudi128_mask ((__v2di *) __P,
+ (__v2di) __A,
+ (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS256
+_mm256_storeu_epi64 (void *__P, __m256i __A)
+{
+ struct __storeu_epi64 {
+ __m256i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_epi64*)__P)->__v = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+ __builtin_ia32_storedqudi256_mask ((__v4di *) __P,
+ (__v4di) __A,
+ (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS128
+_mm_storeu_epi32 (void *__P, __m128i __A)
+{
+ struct __storeu_epi32 {
+ __m128i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_epi32*)__P)->__v = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+ __builtin_ia32_storedqusi128_mask ((__v4si *) __P,
+ (__v4si) __A,
+ (__mmask8) __U);
+}
+
+static __inline void __DEFAULT_FN_ATTRS256
+_mm256_storeu_epi32 (void *__P, __m256i __A)
+{
+ struct __storeu_epi32 {
+ __m256i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_epi32*)__P)->__v = __A;
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+ __builtin_ia32_storedqusi256_mask ((__v8si *) __P,
+ (__v8si) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+ __builtin_ia32_storeupd128_mask ((__v2df *) __P,
+ (__v2df) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+ __builtin_ia32_storeupd256_mask ((__v4df *) __P,
+ (__v4df) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+ __builtin_ia32_storeups128_mask ((__v4sf *) __P,
+ (__v4sf) __A,
+ (__mmask8) __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+ __builtin_ia32_storeups256_mask ((__v8sf *) __P,
+ (__v8sf) __A,
+ (__mmask8) __U);
+}
+
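+/* Masked unpacks: interleave elements from the upper (unpackhi) or lower
+ * (unpacklo) half of each 128-bit lane of __A and __B, then blend with __W
+ * (or zero) under the mask __U. */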
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_unpackhi_pd(__A, __B),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_unpackhi_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_unpackhi_pd(__A, __B),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_unpackhi_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_unpackhi_ps(__A, __B),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_unpackhi_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_unpackhi_ps(__A, __B),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_unpackhi_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_unpacklo_pd(__A, __B),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_unpacklo_pd(__A, __B),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_unpacklo_pd(__A, __B),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_unpacklo_pd(__A, __B),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_unpacklo_ps(__A, __B),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_unpacklo_ps(__A, __B),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_unpacklo_ps(__A, __B),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_unpacklo_ps(__A, __B),
+ (__v8sf)_mm256_setzero_ps());
+}
+
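+/* RCP14: approximate reciprocal with a maximum relative error of 2^-14. */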
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_rcp14_pd (__m128d __A)
+{
+ return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_rcp14_pd (__m256d __A)
+{
+ return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_rcp14_ps (__m128 __A)
+{
+ return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_rcp14_ps (__m256 __A)
+{
+ return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
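+/* Masked permutes: shuffle elements within each 128-bit lane, either by an
+ * immediate (permute) or by a per-element control vector (permutevar), then
+ * blend under the mask. */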
+#define _mm_mask_permute_pd(W, U, X, C) \
+ ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+ (__v2df)_mm_permute_pd((X), (C)), \
+ (__v2df)(__m128d)(W)))
+
+#define _mm_maskz_permute_pd(U, X, C) \
+ ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+ (__v2df)_mm_permute_pd((X), (C)), \
+ (__v2df)_mm_setzero_pd()))
+
+#define _mm256_mask_permute_pd(W, U, X, C) \
+ ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_permute_pd((X), (C)), \
+ (__v4df)(__m256d)(W)))
+
+#define _mm256_maskz_permute_pd(U, X, C) \
+ ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_permute_pd((X), (C)), \
+ (__v4df)_mm256_setzero_pd()))
+
+#define _mm_mask_permute_ps(W, U, X, C) \
+ ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm_permute_ps((X), (C)), \
+ (__v4sf)(__m128)(W)))
+
+#define _mm_maskz_permute_ps(U, X, C) \
+ ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm_permute_ps((X), (C)), \
+ (__v4sf)_mm_setzero_ps()))
+
+#define _mm256_mask_permute_ps(W, U, X, C) \
+ ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm256_permute_ps((X), (C)), \
+ (__v8sf)(__m256)(W)))
+
+#define _mm256_maskz_permute_ps(U, X, C) \
+ ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm256_permute_ps((X), (C)), \
+ (__v8sf)_mm256_setzero_ps()))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C)
+{
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_permutevar_pd(__A, __C),
+ (__v2df)__W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C)
+{
+ return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
+ (__v2df)_mm_permutevar_pd(__A, __C),
+ (__v2df)_mm_setzero_pd());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C)
+{
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_permutevar_pd(__A, __C),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C)
+{
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_permutevar_pd(__A, __C),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C)
+{
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_permutevar_ps(__A, __C),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C)
+{
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_permutevar_ps(__A, __C),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_permutevar_ps(__A, __C),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_permutevar_ps(__A, __C),
+ (__v8sf)_mm256_setzero_ps());
+}
+
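+/* test_* sets mask bit i when element i of (__A & __B) is nonzero;
+ * testn_* sets it when that element is zero. */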
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_test_epi32_mask (__m128i __A, __m128i __B)
+{
+ return _mm_cmpneq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return _mm_mask_cmpneq_epi32_mask (__U, _mm_and_si128 (__A, __B),
+ _mm_setzero_si128());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
+_mm256_test_epi32_mask (__m256i __A, __m256i __B)
+{
+ return _mm256_cmpneq_epi32_mask (_mm256_and_si256 (__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
+_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return _mm256_mask_cmpneq_epi32_mask (__U, _mm256_and_si256 (__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_test_epi64_mask (__m128i __A, __m128i __B)
+{
+ return _mm_cmpneq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return _mm_mask_cmpneq_epi64_mask (__U, _mm_and_si128 (__A, __B),
+ _mm_setzero_si128());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
+_mm256_test_epi64_mask (__m256i __A, __m256i __B)
+{
+ return _mm256_cmpneq_epi64_mask (_mm256_and_si256 (__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
+_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return _mm256_mask_cmpneq_epi64_mask (__U, _mm256_and_si256 (__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_testn_epi32_mask (__m128i __A, __m128i __B)
+{
+ return _mm_cmpeq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return _mm_mask_cmpeq_epi32_mask (__U, _mm_and_si128 (__A, __B),
+ _mm_setzero_si128());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
+_mm256_testn_epi32_mask (__m256i __A, __m256i __B)
+{
+ return _mm256_cmpeq_epi32_mask (_mm256_and_si256 (__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
+_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return _mm256_mask_cmpeq_epi32_mask (__U, _mm256_and_si256 (__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_testn_epi64_mask (__m128i __A, __m128i __B)
+{
+ return _mm_cmpeq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
+_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return _mm_mask_cmpeq_epi64_mask (__U, _mm_and_si128 (__A, __B),
+ _mm_setzero_si128());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
+_mm256_testn_epi64_mask (__m256i __A, __m256i __B)
+{
+ return _mm256_cmpeq_epi64_mask (_mm256_and_si256 (__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
+_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return _mm256_mask_cmpeq_epi64_mask (__U, _mm256_and_si256 (__A, __B),
+ _mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_unpackhi_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_unpackhi_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_unpackhi_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_unpackhi_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_unpackhi_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_unpackhi_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_unpackhi_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_unpackhi_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_unpacklo_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_unpacklo_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_unpacklo_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_unpacklo_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_unpacklo_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_unpacklo_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_unpacklo_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_unpacklo_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
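+/* Masked arithmetic right shifts. The 64-bit element forms (sra_epi64,
+ * srai_epi64) have no pre-AVX-512 equivalent. */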
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sra_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_sra_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sra_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_sra_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srai_epi32(__A, __B),
+ (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
+ (__v4si)_mm_srai_epi32(__A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srai_epi32(__A, __B),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
+ (__v8si)_mm256_srai_epi32(__A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_sra_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sra_epi64(__A, __B),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_sra_epi64(__A, __B),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sra_epi64(__m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_psraq256((__v4di) __A, (__v2di) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sra_epi64(__A, __B),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_sra_epi64(__A, __B),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_srai_epi64(__m128i __A, unsigned int __imm)
+{
+ return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __imm)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srai_epi64(__A, __imm),
+ (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm)
+{
+ return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
+ (__v2di)_mm_srai_epi64(__A, __imm),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srai_epi64(__m256i __A, unsigned int __imm)
+{
+ return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A,
+ unsigned int __imm)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srai_epi64(__A, __imm),
+ (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
+ (__v4di)_mm256_srai_epi64(__A, __imm),
+ (__v4di)_mm256_setzero_si256());
+}
+
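+/* VPTERNLOG: each result bit is looked up in the 8-bit immediate, indexed by
+ * the corresponding bits of A (bit 2), B (bit 1) and C (bit 0), so the
+ * immediate acts as a three-input truth table. */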
+#define _mm_ternarylogic_epi32(A, B, C, imm) \
+ ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B), \
+ (__v4si)(__m128i)(C), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) \
+ ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B), \
+ (__v4si)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \
+ ((__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B), \
+ (__v4si)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm256_ternarylogic_epi32(A, B, C, imm) \
+ ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
+ (__v8si)(__m256i)(B), \
+ (__v8si)(__m256i)(C), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) \
+ ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
+ (__v8si)(__m256i)(B), \
+ (__v8si)(__m256i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \
+ ((__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \
+ (__v8si)(__m256i)(B), \
+ (__v8si)(__m256i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm_ternarylogic_epi64(A, B, C, imm) \
+ ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B), \
+ (__v2di)(__m128i)(C), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) \
+ ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B), \
+ (__v2di)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) \
+ ((__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B), \
+ (__v2di)(__m128i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm256_ternarylogic_epi64(A, B, C, imm) \
+ ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), \
+ (__v4di)(__m256i)(C), (int)(imm), \
+ (__mmask8)-1))
+
+#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) \
+ ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), \
+ (__v4di)(__m256i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \
+ ((__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), \
+ (__v4di)(__m256i)(C), (int)(imm), \
+ (__mmask8)(U)))
+
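+/* 128-bit lane shuffles: the immediate selects which 128-bit lane of A forms
+ * the low half of the result and which lane of B forms the high half. */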
+#define _mm256_shuffle_f32x4(A, B, imm) \
+ ((__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(imm)))
+
+#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) \
+ ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
+ (__v8sf)(__m256)(W)))
+
+#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) \
+ ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
+ (__v8sf)_mm256_setzero_ps()))
+
+#define _mm256_shuffle_f64x2(A, B, imm) \
+ ((__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(imm)))
+
+#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) \
+ ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \
+ (__v4df)(__m256d)(W)))
+
+#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) \
+ ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \
+ (__v4df)_mm256_setzero_pd()))
+
+#define _mm256_shuffle_i32x4(A, B, imm) \
+ ((__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \
+ (__v8si)(__m256i)(B), (int)(imm)))
+
+#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \
+ (__v8si)(__m256i)(W)))
+
+#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \
+ (__v8si)_mm256_setzero_si256()))
+
+#define _mm256_shuffle_i64x2(A, B, imm) \
+ ((__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), (int)(imm)))
+
+#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \
+ (__v4di)(__m256i)(W)))
+
+#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \
+ (__v4di)_mm256_setzero_si256()))
+
+#define _mm_mask_shuffle_pd(W, U, A, B, M) \
+ ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+ (__v2df)_mm_shuffle_pd((A), (B), (M)), \
+ (__v2df)(__m128d)(W)))
+
+#define _mm_maskz_shuffle_pd(U, A, B, M) \
+ ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
+ (__v2df)_mm_shuffle_pd((A), (B), (M)), \
+ (__v2df)_mm_setzero_pd()))
+
+#define _mm256_mask_shuffle_pd(W, U, A, B, M) \
+ ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
+ (__v4df)(__m256d)(W)))
+
+#define _mm256_maskz_shuffle_pd(U, A, B, M) \
+ ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
+ (__v4df)_mm256_setzero_pd()))
+
+#define _mm_mask_shuffle_ps(W, U, A, B, M) \
+ ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
+ (__v4sf)(__m128)(W)))
+
+#define _mm_maskz_shuffle_ps(U, A, B, M) \
+ ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
+ (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
+ (__v4sf)_mm_setzero_ps()))
+
+#define _mm256_mask_shuffle_ps(W, U, A, B, M) \
+ ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
+ (__v8sf)(__m256)(W)))
+
+#define _mm256_maskz_shuffle_ps(U, A, B, M) \
+ ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
+ (__v8sf)_mm256_setzero_ps()))
+
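+/* RSQRT14: approximate reciprocal square root with a maximum relative error
+ * of 2^-14. */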
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_rsqrt14_pd (__m128d __A)
+{
+ return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_rsqrt14_pd (__m256d __A)
+{
+ return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_rsqrt14_ps (__m128 __A)
+{
+ return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_rsqrt14_ps (__m256 __A)
+{
+ return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
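+/* Broadcasts: broadcast_f32x4/i32x4 replicate a 128-bit value into both
+ * halves of a 256-bit vector; broadcastss/sd/d/q replicate the lowest
+ * element, with merge- or zero-masking. */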
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_broadcast_f32x4(__m128 __A)
+{
+ return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
+ 0, 1, 2, 3, 0, 1, 2, 3);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+ (__v8sf)_mm256_broadcast_f32x4(__A),
+ (__v8sf)__O);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
+ (__v8sf)_mm256_broadcast_f32x4(__A),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_broadcast_i32x4(__m128i __A)
+{
+ return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
+ 0, 1, 2, 3, 0, 1, 2, 3);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_broadcast_i32x4(__A),
+ (__v8si)__O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_broadcast_i32x4(__A),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A)
+{
+ return (__m256d)__builtin_ia32_selectpd_256(__M,
+ (__v4df) _mm256_broadcastsd_pd(__A),
+ (__v4df) __O);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+ return (__m256d)__builtin_ia32_selectpd_256(__M,
+ (__v4df) _mm256_broadcastsd_pd(__A),
+ (__v4df) _mm256_setzero_pd());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A)
+{
+ return (__m128)__builtin_ia32_selectps_128(__M,
+ (__v4sf) _mm_broadcastss_ps(__A),
+ (__v4sf) __O);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
+{
+ return (__m128)__builtin_ia32_selectps_128(__M,
+ (__v4sf) _mm_broadcastss_ps(__A),
+ (__v4sf) _mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A)
+{
+ return (__m256)__builtin_ia32_selectps_256(__M,
+ (__v8sf) _mm256_broadcastss_ps(__A),
+ (__v8sf) __O);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
+{
+ return (__m256)__builtin_ia32_selectps_256(__M,
+ (__v8sf) _mm256_broadcastss_ps(__A),
+ (__v8sf) _mm256_setzero_ps());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__M,
+ (__v4si) _mm_broadcastd_epi32(__A),
+ (__v4si) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__M,
+ (__v4si) _mm_broadcastd_epi32(__A),
+ (__v4si) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__M,
+ (__v8si) _mm256_broadcastd_epi32(__A),
+ (__v8si) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__M,
+ (__v8si) _mm256_broadcastd_epi32(__A),
+ (__v8si) _mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__M,
+ (__v2di) _mm_broadcastq_epi64(__A),
+ (__v2di) __O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__M,
+ (__v2di) _mm_broadcastq_epi64(__A),
+ (__v2di) _mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__M,
+ (__v4di) _mm256_broadcastq_epi64(__A),
+ (__v4di) __O);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__M,
+ (__v4di) _mm256_broadcastq_epi64(__A),
+ (__v4di) _mm256_setzero_si256());
+}
+
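+/* Narrowing conversions with signed saturation (cvtsepi*); the unsigned
+ * saturating cvtusepi* forms follow below. The *_storeu_* variants write the
+ * narrowed elements directly to memory under the mask. */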
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtsepi32_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+ (__v16qi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+ (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtsepi32_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+ (__v16qi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+ (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtsepi32_epi16 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+ (__v8hi)_mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+ (__v8hi)__O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovsdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtsepi32_epi16 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+ (__v8hi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtsepi64_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+ (__v16qi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+ (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovsqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtsepi64_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+ (__v16qi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+ (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovsqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtsepi64_epi32 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+ (__v4si)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+ (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+ (__v4si) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovsqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtsepi64_epi32 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+ (__v4si)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+ (__v4si)__O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+ (__v4si) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtsepi64_epi16 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+ (__v8hi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovsqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtsepi64_epi16 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+ (__v8hi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovsqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtusepi32_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+ (__v16qi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovusdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtusepi32_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+ (__v16qi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovusdb256mem_mask ((__v16qi*) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtusepi32_epi16 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+ (__v8hi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovusdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtusepi32_epi16 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+ (__v8hi) _mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtusepi64_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+ (__v16qi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovusqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtusepi64_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+ (__v16qi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovusqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtusepi64_epi32 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+ (__v4si)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+ (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+ (__v4si) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovusqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtusepi64_epi32 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+ (__v4si)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+ (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+ (__v4si) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtusepi64_epi16 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+ (__v8hi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovusqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtusepi64_epi16 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+ (__v8hi)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtepi32_epi8 (__m128i __A)
+{
+ return (__m128i)__builtin_shufflevector(
+ __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
+ 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+ (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtepi32_epi8 (__m256i __A)
+{
+ return (__m128i)__builtin_shufflevector(
+ __builtin_convertvector((__v8si)__A, __v8qi),
+ (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+ (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtepi32_epi16 (__m128i __A)
+{
+ return (__m128i)__builtin_shufflevector(
+ __builtin_convertvector((__v4si)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1,
+ 2, 3, 4, 5, 6, 7);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtepi32_epi16 (__m256i __A)
+{
+ return (__m128i)__builtin_convertvector((__v8si)__A, __v8hi);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtepi64_epi8 (__m128i __A)
+{
+ return (__m128i)__builtin_shufflevector(
+ __builtin_convertvector((__v2di)__A, __v2qi), (__v2qi){0, 0}, 0, 1, 2, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+ (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtepi64_epi8 (__m256i __A)
+{
+ return (__m128i)__builtin_shufflevector(
+ __builtin_convertvector((__v4di)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
+ 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+ (__v16qi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+ (__v16qi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtepi64_epi32 (__m128i __A)
+{
+ return (__m128i)__builtin_shufflevector(
+ __builtin_convertvector((__v2di)__A, __v2si), (__v2si){0, 0}, 0, 1, 2, 3);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+ (__v4si) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+ (__v4si) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtepi64_epi32 (__m256i __A)
+{
+ return (__m128i)__builtin_convertvector((__v4di)__A, __v4si);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm256_cvtepi64_epi32(__A),
+ (__v4si)__O);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
+ (__v4si)_mm256_cvtepi64_epi32(__A),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_cvtepi64_epi16 (__m128i __A)
+{
+ return (__m128i)__builtin_shufflevector(
+ __builtin_convertvector((__v2di)__A, __v2hi), (__v2hi){0, 0}, 0, 1, 2, 3,
+ 3, 3, 3, 3);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+ (__v8hi)__O,
+ __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_cvtepi64_epi16 (__m256i __A)
+{
+ return (__m128i)__builtin_shufflevector(
+ __builtin_convertvector((__v4di)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1,
+ 2, 3, 4, 5, 6, 7);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+ (__v8hi) __O, __M);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+ (__v8hi) _mm_setzero_si128 (),
+ __M);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
+}
+
+#define _mm256_extractf32x4_ps(A, imm) \
+ ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
+ (int)(imm), \
+ (__v4sf)_mm_undefined_ps(), \
+ (__mmask8)-1))
+
+#define _mm256_mask_extractf32x4_ps(W, U, A, imm) \
+ ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
+ (int)(imm), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_extractf32x4_ps(U, A, imm) \
+ ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
+ (int)(imm), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U)))
+
+#define _mm256_extracti32x4_epi32(A, imm) \
+ ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
+ (int)(imm), \
+ (__v4si)_mm_undefined_si128(), \
+ (__mmask8)-1))
+
+#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \
+ ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
+ (int)(imm), \
+ (__v4si)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_extracti32x4_epi32(U, A, imm) \
+ ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
+ (int)(imm), \
+ (__v4si)_mm_setzero_si128(), \
+ (__mmask8)(U)))
+
+#define _mm256_insertf32x4(A, B, imm) \
+ ((__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \
+ (__v4sf)(__m128)(B), (int)(imm)))
+
+#define _mm256_mask_insertf32x4(W, U, A, B, imm) \
+ ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+ (__v8sf)(__m256)(W)))
+
+#define _mm256_maskz_insertf32x4(U, A, B, imm) \
+ ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
+ (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
+ (__v8sf)_mm256_setzero_ps()))
+
+#define _mm256_inserti32x4(A, B, imm) \
+ ((__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \
+ (__v4si)(__m128i)(B), (int)(imm)))
+
+#define _mm256_mask_inserti32x4(W, U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+ (__v8si)(__m256i)(W)))
+
+#define _mm256_maskz_inserti32x4(U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
+ (__v8si)_mm256_setzero_si256()))
+
+#define _mm_getmant_pd(A, B, C) \
+ ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)-1))
+
+#define _mm_mask_getmant_pd(W, U, A, B, C) \
+ ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_getmant_pd(U, A, B, C) \
+ ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm256_getmant_pd(A, B, C) \
+ ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v4df)_mm256_setzero_pd(), \
+ (__mmask8)-1))
+
+#define _mm256_mask_getmant_pd(W, U, A, B, C) \
+ ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v4df)(__m256d)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_getmant_pd(U, A, B, C) \
+ ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v4df)_mm256_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm_getmant_ps(A, B, C) \
+ ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)-1))
+
+#define _mm_mask_getmant_ps(W, U, A, B, C) \
+ ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_getmant_ps(U, A, B, C) \
+ ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U)))
+
+#define _mm256_getmant_ps(A, B, C) \
+ ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)-1))
+
+#define _mm256_mask_getmant_ps(W, U, A, B, C) \
+ ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v8sf)(__m256)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_getmant_ps(U, A, B, C) \
+ ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
+ (int)(((C)<<2) | (B)), \
+ (__v8sf)_mm256_setzero_ps(), \
+ (__mmask8)(U)))
+
+#define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \
+ ((__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \
+ (void const *)(addr), \
+ (__v2di)(__m128i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \
+ ((__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \
+ (void const *)(addr), \
+ (__v2di)(__m128i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \
+ ((__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \
+ (void const *)(addr), \
+ (__v4di)(__m256i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \
+ ((__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \
+ (void const *)(addr), \
+ (__v4di)(__m256i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \
+ ((__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \
+ (void const *)(addr), \
+ (__v2di)(__m128i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \
+ ((__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \
+ (void const *)(addr), \
+ (__v2di)(__m128i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \
+ ((__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \
+ (void const *)(addr), \
+ (__v4di)(__m256i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \
+ ((__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \
+ (void const *)(addr), \
+ (__v4di)(__m256i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \
+ ((__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \
+ (void const *)(addr), \
+ (__v4si)(__m128i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \
+ ((__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \
+ (void const *)(addr), \
+ (__v4si)(__m128i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \
+ ((__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \
+ (void const *)(addr), \
+ (__v4si)(__m128i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \
+ ((__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \
+ (void const *)(addr), \
+ (__v4si)(__m128i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \
+ ((__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \
+ (void const *)(addr), \
+ (__v4si)(__m128i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \
+ ((__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \
+ (void const *)(addr), \
+ (__v4si)(__m128i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \
+ ((__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \
+ (void const *)(addr), \
+ (__v8si)(__m256i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \
+ ((__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \
+ (void const *)(addr), \
+ (__v8si)(__m256i)(index), \
+ (__mmask8)(mask), (int)(scale)))
+
+#define _mm256_permutex_pd(X, C) \
+ ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(C)))
+
+#define _mm256_mask_permutex_pd(W, U, X, C) \
+ ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_permutex_pd((X), (C)), \
+ (__v4df)(__m256d)(W)))
+
+#define _mm256_maskz_permutex_pd(U, X, C) \
+ ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
+ (__v4df)_mm256_permutex_pd((X), (C)), \
+ (__v4df)_mm256_setzero_pd()))
+
+#define _mm256_permutex_epi64(X, C) \
+ ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(C)))
+
+#define _mm256_mask_permutex_epi64(W, U, X, C) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_permutex_epi64((X), (C)), \
+ (__v4di)(__m256i)(W)))
+
+#define _mm256_maskz_permutex_epi64(U, X, C) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_permutex_epi64((X), (C)), \
+ (__v4di)_mm256_setzero_si256()))
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_permutexvar_pd (__m256i __X, __m256d __Y)
+{
+ return (__m256d)__builtin_ia32_permvardf256((__v4df)__Y, (__v4di)__X);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
+ __m256d __Y)
+{
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_permutexvar_pd(__X, __Y),
+ (__v4df)__W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
+{
+ return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
+ (__v4df)_mm256_permutexvar_pd(__X, __Y),
+ (__v4df)_mm256_setzero_pd());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_permvardi256((__v4di) __Y, (__v4di) __X);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_permutexvar_epi64(__X, __Y),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
+ (__v4di)_mm256_permutexvar_epi64(__X, __Y),
+ (__v4di)__W);
+}
+
+#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps((B), (A))
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_permutexvar_ps(__X, __Y),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_permutexvar_ps(__X, __Y),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+#define _mm256_permutexvar_epi32(A, B) _mm256_permutevar8x32_epi32((B), (A))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_permutexvar_epi32(__X, __Y),
+ (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
+ (__v8si)_mm256_permutexvar_epi32(__X, __Y),
+ (__v8si)_mm256_setzero_si256());
+}
+
+#define _mm_alignr_epi32(A, B, imm) \
+ ((__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B), (int)(imm)))
+
+#define _mm_mask_alignr_epi32(W, U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+ (__v4si)(__m128i)(W)))
+
+#define _mm_maskz_alignr_epi32(U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
+ (__v4si)_mm_setzero_si128()))
+
+#define _mm256_alignr_epi32(A, B, imm) \
+ ((__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \
+ (__v8si)(__m256i)(B), (int)(imm)))
+
+#define _mm256_mask_alignr_epi32(W, U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+ (__v8si)(__m256i)(W)))
+
+#define _mm256_maskz_alignr_epi32(U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
+ (__v8si)_mm256_setzero_si256()))
+
+#define _mm_alignr_epi64(A, B, imm) \
+ ((__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B), (int)(imm)))
+
+#define _mm_mask_alignr_epi64(W, U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+ (__v2di)(__m128i)(W)))
+
+#define _mm_maskz_alignr_epi64(U, A, B, imm) \
+ ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
+ (__v2di)_mm_setzero_si128()))
+
+#define _mm256_alignr_epi64(A, B, imm) \
+ ((__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), (int)(imm)))
+
+#define _mm256_mask_alignr_epi64(W, U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+ (__v4di)(__m256i)(W)))
+
+#define _mm256_maskz_alignr_epi64(U, A, B, imm) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
+ (__v4di)_mm256_setzero_si256()))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_movehdup_ps(__A),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_movehdup_ps(__A),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_movehdup_ps(__A),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_movehdup_ps(__A),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_moveldup_ps(__A),
+ (__v4sf)__W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
+ (__v4sf)_mm_moveldup_ps(__A),
+ (__v4sf)_mm_setzero_ps());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_moveldup_ps(__A),
+ (__v8sf)__W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
+ (__v8sf)_mm256_moveldup_ps(__A),
+ (__v8sf)_mm256_setzero_ps());
+}
+
+#define _mm256_mask_shuffle_epi32(W, U, A, I) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_shuffle_epi32((A), (I)), \
+ (__v8si)(__m256i)(W)))
+
+#define _mm256_maskz_shuffle_epi32(U, A, I) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_shuffle_epi32((A), (I)), \
+ (__v8si)_mm256_setzero_si256()))
+
+#define _mm_mask_shuffle_epi32(W, U, A, I) \
+ ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm_shuffle_epi32((A), (I)), \
+ (__v4si)(__m128i)(W)))
+
+#define _mm_maskz_shuffle_epi32(U, A, I) \
+ ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm_shuffle_epi32((A), (I)), \
+ (__v4si)_mm_setzero_si128()))
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
+ (__v2df) __A,
+ (__v2df) __W);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maskz_mov_pd (__mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
+ (__v2df) __A,
+ (__v2df) _mm_setzero_pd ());
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
+ (__v4df) __A,
+ (__v4df) __W);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
+ (__v4df) __A,
+ (__v4df) _mm256_setzero_pd ());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
+ (__v4sf) __A,
+ (__v4sf) __W);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_mov_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
+ (__v4sf) __A,
+ (__v4sf) _mm_setzero_ps ());
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
+ (__v8sf) __A,
+ (__v8sf) __W);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
+ (__v8sf) __A,
+ (__v8sf) _mm256_setzero_ps ());
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+{
+ return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
+{
+ return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+{
+ return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+#define _mm_mask_cvt_roundps_ph(W, U, A, I) \
+ ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
+ (__v8hi)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_cvt_roundps_ph(U, A, I) \
+ ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
+ (__v8hi)_mm_setzero_si128(), \
+ (__mmask8)(U)))
+
+#define _mm_mask_cvtps_ph _mm_mask_cvt_roundps_ph
+#define _mm_maskz_cvtps_ph _mm_maskz_cvt_roundps_ph
+
+#define _mm256_mask_cvt_roundps_ph(W, U, A, I) \
+ ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
+ (__v8hi)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_cvt_roundps_ph(U, A, I) \
+ ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
+ (__v8hi)_mm_setzero_si128(), \
+ (__mmask8)(U)))
+
+#define _mm256_mask_cvtps_ph _mm256_mask_cvt_roundps_ph
+#define _mm256_maskz_cvtps_ph _mm256_maskz_cvt_roundps_ph
+
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif /* __AVX512VLINTRIN_H */
--- /dev/null
+/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLVBMI2INTRIN_H
+#define __AVX512VLVBMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(256)))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
+{
+ return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
+ (__v8hi) __S,
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D)
+{
+ return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
+ (__v8hi) _mm_setzero_si128(),
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
+{
+ return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
+ (__v16qi) __S,
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D)
+{
+ return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
+ (__v16qi) _mm_setzero_si128(),
+ __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
+{
+ __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
+ __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
+{
+ __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
+{
+ return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
+ (__v8hi) __S,
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D)
+{
+ return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
+ (__v8hi) _mm_setzero_si128(),
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
+{
+ return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
+ (__v16qi) __S,
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D)
+{
+ return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
+ (__v16qi) _mm_setzero_si128(),
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
+ (__v8hi) __S,
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
+ (__v8hi) _mm_setzero_si128(),
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
+ (__v16qi) __S,
+ __U);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
+ (__v16qi) _mm_setzero_si128(),
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
+{
+ return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
+ (__v16hi) __S,
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
+{
+ return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
+ (__v16hi) _mm256_setzero_si256(),
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
+{
+ return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
+ (__v32qi) __S,
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
+{
+ return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
+ (__v32qi) _mm256_setzero_si256(),
+ __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
+{
+ __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
+ __U);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
+{
+ __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
+{
+ return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
+ (__v16hi) __S,
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
+{
+ return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
+ (__v16hi) _mm256_setzero_si256(),
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
+{
+ return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
+ (__v32qi) __S,
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
+{
+ return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
+ (__v32qi) _mm256_setzero_si256(),
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
+ (__v16hi) __S,
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
+ (__v16hi) _mm256_setzero_si256(),
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
+ (__v32qi) __S,
+ __U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
+ (__v32qi) _mm256_setzero_si256(),
+ __U);
+}
+
+#define _mm256_shldi_epi64(A, B, I) \
+ ((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), (int)(I)))
+
+#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
+ (__v4di)(__m256i)(S)))
+
+#define _mm256_maskz_shldi_epi64(U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
+ (__v4di)_mm256_setzero_si256()))
+
+#define _mm_shldi_epi64(A, B, I) \
+ ((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B), (int)(I)))
+
+#define _mm_mask_shldi_epi64(S, U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm_shldi_epi64((A), (B), (I)), \
+ (__v2di)(__m128i)(S)))
+
+#define _mm_maskz_shldi_epi64(U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm_shldi_epi64((A), (B), (I)), \
+ (__v2di)_mm_setzero_si128()))
+
+#define _mm256_shldi_epi32(A, B, I) \
+ ((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
+ (__v8si)(__m256i)(B), (int)(I)))
+
+#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
+ (__v8si)(__m256i)(S)))
+
+#define _mm256_maskz_shldi_epi32(U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
+ (__v8si)_mm256_setzero_si256()))
+
+#define _mm_shldi_epi32(A, B, I) \
+ ((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B), (int)(I)))
+
+#define _mm_mask_shldi_epi32(S, U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm_shldi_epi32((A), (B), (I)), \
+ (__v4si)(__m128i)(S)))
+
+#define _mm_maskz_shldi_epi32(U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm_shldi_epi32((A), (B), (I)), \
+ (__v4si)_mm_setzero_si128()))
+
+#define _mm256_shldi_epi16(A, B, I) \
+ ((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
+ (__v16hi)(__m256i)(B), (int)(I)))
+
+#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+ (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
+ (__v16hi)(__m256i)(S)))
+
+#define _mm256_maskz_shldi_epi16(U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+ (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
+ (__v16hi)_mm256_setzero_si256()))
+
+#define _mm_shldi_epi16(A, B, I) \
+ ((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
+ (__v8hi)(__m128i)(B), (int)(I)))
+
+#define _mm_mask_shldi_epi16(S, U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+ (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
+ (__v8hi)(__m128i)(S)))
+
+#define _mm_maskz_shldi_epi16(U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+ (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
+ (__v8hi)_mm_setzero_si128()))
+
+#define _mm256_shrdi_epi64(A, B, I) \
+ ((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), (int)(I)))
+
+#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
+ (__v4di)(__m256i)(S)))
+
+#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
+ (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
+ (__v4di)_mm256_setzero_si256()))
+
+#define _mm_shrdi_epi64(A, B, I) \
+ ((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B), (int)(I)))
+
+#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
+ (__v2di)(__m128i)(S)))
+
+#define _mm_maskz_shrdi_epi64(U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
+ (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
+ (__v2di)_mm_setzero_si128()))
+
+#define _mm256_shrdi_epi32(A, B, I) \
+ ((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
+ (__v8si)(__m256i)(B), (int)(I)))
+
+#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
+ (__v8si)(__m256i)(S)))
+
+#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
+ (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
+ (__v8si)_mm256_setzero_si256()))
+
+#define _mm_shrdi_epi32(A, B, I) \
+ ((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B), (int)(I)))
+
+#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
+ (__v4si)(__m128i)(S)))
+
+#define _mm_maskz_shrdi_epi32(U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
+ (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
+ (__v4si)_mm_setzero_si128()))
+
+#define _mm256_shrdi_epi16(A, B, I) \
+ ((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
+ (__v16hi)(__m256i)(B), (int)(I)))
+
+#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+ (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
+ (__v16hi)(__m256i)(S)))
+
+#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
+ (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
+ (__v16hi)_mm256_setzero_si256()))
+
+#define _mm_shrdi_epi16(A, B, I) \
+ ((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
+ (__v8hi)(__m128i)(B), (int)(I)))
+
+#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+ (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
+ (__v8hi)(__m128i)(S)))
+
+#define _mm_maskz_shrdi_epi16(U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
+ (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
+ (__v8hi)_mm_setzero_si128()))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B,
+ (__v4di)__C);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__U,
+ (__v4di)_mm256_shldv_epi64(__A, __B, __C),
+ (__v4di)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__U,
+ (__v4di)_mm256_shldv_epi64(__A, __B, __C),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B,
+ (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__U,
+ (__v2di)_mm_shldv_epi64(__A, __B, __C),
+ (__v2di)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__U,
+ (__v2di)_mm_shldv_epi64(__A, __B, __C),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B,
+ (__v8si)__C);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_shldv_epi32(__A, __B, __C),
+ (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_shldv_epi32(__A, __B, __C),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B,
+ (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_shldv_epi32(__A, __B, __C),
+ (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_shldv_epi32(__A, __B, __C),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B,
+ (__v16hi)__C);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_selectw_256(__U,
+ (__v16hi)_mm256_shldv_epi16(__A, __B, __C),
+ (__v16hi)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_selectw_256(__U,
+ (__v16hi)_mm256_shldv_epi16(__A, __B, __C),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B,
+ (__v8hi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_selectw_128(__U,
+ (__v8hi)_mm_shldv_epi16(__A, __B, __C),
+ (__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_selectw_128(__U,
+ (__v8hi)_mm_shldv_epi16(__A, __B, __C),
+ (__v8hi)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B,
+ (__v4di)__C);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__U,
+ (__v4di)_mm256_shrdv_epi64(__A, __B, __C),
+ (__v4di)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_selectq_256(__U,
+ (__v4di)_mm256_shrdv_epi64(__A, __B, __C),
+ (__v4di)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B,
+ (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__U,
+ (__v2di)_mm_shrdv_epi64(__A, __B, __C),
+ (__v2di)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_selectq_128(__U,
+ (__v2di)_mm_shrdv_epi64(__A, __B, __C),
+ (__v2di)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B,
+ (__v8si)__C);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_shrdv_epi32(__A, __B, __C),
+ (__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_shrdv_epi32(__A, __B, __C),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B,
+ (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_shrdv_epi32(__A, __B, __C),
+ (__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_shrdv_epi32(__A, __B, __C),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B,
+ (__v16hi)__C);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_selectw_256(__U,
+ (__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
+ (__v16hi)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)__builtin_ia32_selectw_256(__U,
+ (__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
+ (__v16hi)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B,
+ (__v8hi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_selectw_128(__U,
+ (__v8hi)_mm_shrdv_epi16(__A, __B, __C),
+ (__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_selectw_128(__U,
+ (__v8hi)_mm_shrdv_epi16(__A, __B, __C),
+ (__v8hi)_mm_setzero_si128());
+}
+
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
--- /dev/null
+/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VLVNNIINTRIN_H
+#define __AVX512VLVNNIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpbusd_epi32(S, A, B) \
+ ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
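Editorial aside, not part of the patch: a minimal usage sketch of the 256-bit form defined above. The wrapper name and build flags (Clang with -mavx512vl -mavx512vnni) are illustrative assumptions; the intrinsic accumulates four u8*s8 products into each 32-bit lane of the accumulator.

    #include <immintrin.h>

    /* acc.dword[j] += dot product of A.byte[4j..4j+3] (unsigned)
     * with B.byte[4j..4j+3] (signed), per the pseudocode above. */
    static inline __m256i dot_accumulate_u8s8(__m256i acc, __m256i a_u8, __m256i b_s8)
    {
        return _mm256_dpbusd_epi32(acc, a_u8, b_s8);
    }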
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpbusds_epi32(S, A, B) \
+ ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpwssd_epi32(S, A, B) \
+ ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+#define _mm256_dpwssds_epi32(S, A, B) \
+ ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpbusd_epi32(S, A, B) \
+ ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
+/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
+/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpbusds_epi32(S, A, B) \
+ ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpwssd_epi32(S, A, B) \
+ ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
+/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
+/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
+/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+#define _mm_dpwssds_epi32(S, A, B) \
+ ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
+ (__v8si)__S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
+ (__v8si)__S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
+ (__v8si)__S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
+ (__v8si)__S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_selectd_256(__U,
+ (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
+ (__v8si)_mm256_setzero_si256());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
+ (__v4si)__S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
+ (__v4si)__S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
+ (__v4si)__S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
+ (__v4si)__S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_selectd_128(__U,
+ (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
+ (__v4si)_mm_setzero_si128());
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
--- /dev/null
+/*===------ avx512vlvp2intersectintrin.h - VL VP2INTERSECT intrinsics ------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vlvp2intersectintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VLVP2INTERSECT_H
+#define _AVX512VLVP2INTERSECT_H
+
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \
+ __min_vector_width__(128)))
+
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \
+ __min_vector_width__(256)))
+/// Store, in an even/odd pair of mask registers, the indicators of the
+/// locations of value matches between dwords in operands __a and __b.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x i32].
+/// \param __b
+/// A 256-bit vector of [8 x i32].
+/// \param __m0
+/// A pointer to an 8-bit mask that receives the match indicators for \a __a.
+/// \param __m1
+/// A pointer to an 8-bit mask that receives the match indicators for \a __b.
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_2intersect_epi32(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
+ __builtin_ia32_vp2intersect_d_256((__v8si)__a, (__v8si)__b, __m0, __m1);
+}
+
+/// Store, in an even/odd pair of mask registers, the indicators of the
+/// locations of value matches between quadwords in operands __a and __b.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x i64].
+/// \param __b
+/// A 256-bit vector of [4 x i64].
+/// \param __m0
+/// A pointer to an 8-bit mask that receives the match indicators for \a __a.
+/// \param __m1
+/// A pointer to an 8-bit mask that receives the match indicators for \a __b.
+static __inline__ void __DEFAULT_FN_ATTRS256
+_mm256_2intersect_epi64(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
+ __builtin_ia32_vp2intersect_q_256((__v4di)__a, (__v4di)__b, __m0, __m1);
+}
+
+/// Store, in an even/odd pair of mask registers, the indicators of the
+/// locations of value matches between dwords in operands __a and __b.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32].
+/// \param __b
+/// A 128-bit vector of [4 x i32].
+/// \param __m0
+/// A pointer to an 8-bit mask that receives the match indicators for \a __a.
+/// \param __m1
+/// A pointer to an 8-bit mask that receives the match indicators for \a __b.
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_2intersect_epi32(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
+ __builtin_ia32_vp2intersect_d_128((__v4si)__a, (__v4si)__b, __m0, __m1);
+}
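Editorial sketch, not part of the patch: one way the pair of output masks is typically consumed. The helper name and the -mavx512vp2intersect -mavx512vl build flags are assumptions.

    #include <immintrin.h>

    /* After the call, bit i of ka is set when dword i of a equals some dword
     * of b, and bit j of kb is set when dword j of b equals some dword of a. */
    static inline int count_matching_lanes_of_a(__m128i a, __m128i b)
    {
        __mmask8 ka, kb;
        _mm_2intersect_epi32(a, b, &ka, &kb);
        return __builtin_popcount(ka);   /* lanes of a that have a match in b */
    }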
+
+/// Store, in an even/odd pair of mask registers, the indicators of the
+/// locations of value matches between quadwords in operands __a and __b.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x i64].
+/// \param __b
+/// A 128-bit vector of [2 x i64].
+/// \param __m0
+/// A pointer to an 8-bit mask that receives the match indicators for \a __a.
+/// \param __m1
+/// A pointer to an 8-bit mask that receives the match indicators for \a __b.
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_2intersect_epi64(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
+ __builtin_ia32_vp2intersect_q_128((__v2di)__a, (__v2di)__b, __m0, __m1);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
--- /dev/null
+/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VNNIINTRIN_H
+#define __AVX512VNNIINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"), __min_vector_width__(512)))
+
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A,
+ (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
+ (__v16si)__S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A,
+ (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
+ (__v16si)__S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
+ (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
+ (__v16si)__S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
+ (__v16si)__B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
+ (__v16si)__S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_selectd_512(__U,
+ (__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
+ (__v16si)_mm512_setzero_si512());
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===------ avx512vp2intersectintrin.h - VP2INTERSECT intrinsics -----------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avx512vp2intersect.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VP2INTERSECT_H
+#define _AVX512VP2INTERSECT_H
+
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512vp2intersect"), \
+ __min_vector_width__(512)))
+
+/// Store, in an even/odd pair of mask registers, the indicators of the
+/// locations of value matches between dwords in operands __a and __b.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
+///
+/// \param __a
+/// A 512-bit vector of [16 x i32].
+/// \param __b
+/// A 512-bit vector of [16 x i32].
+/// \param __m0
+/// A pointer to a 16-bit mask that receives the match indicators for \a __a.
+/// \param __m1
+/// A pointer to a 16-bit mask that receives the match indicators for \a __b.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_2intersect_epi32(__m512i __a, __m512i __b, __mmask16 *__m0, __mmask16 *__m1) {
+ __builtin_ia32_vp2intersect_d_512((__v16si)__a, (__v16si)__b, __m0, __m1);
+}
+
+/// Store, in an even/odd pair of mask registers, the indicators of the
+/// locations of value matches between quadwords in operands __a and __b.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
+///
+/// \param __a
+/// A 512-bit vector of [8 x i64].
+/// \param __b
+/// A 512-bit vector of [8 x i64].
+/// \param __m0
+/// A pointer to an 8-bit mask that receives the match indicators for \a __a.
+/// \param __m1
+/// A pointer to an 8-bit mask that receives the match indicators for \a __b.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm512_2intersect_epi64(__m512i __a, __m512i __b, __mmask8 *__m0, __mmask8 *__m1) {
+ __builtin_ia32_vp2intersect_q_512((__v8di)__a, (__v8di)__b, __m0, __m1);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VPOPCNTDQINTRIN_H
+#define __AVX512VPOPCNTDQINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq"), __min_vector_width__(512)))
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
+ return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
+ return (__m512i)__builtin_ia32_selectq_512(
+ (__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
+ return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
+ return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
+ return (__m512i)__builtin_ia32_selectd_512(
+ (__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
+ return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===---- avx512vpopcntdqvlintrin.h - AVX512VPOPCNTDQ/VL intrinsics --------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VPOPCNTDQVLINTRIN_H
+#define __AVX512VPOPCNTDQVLINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(256)))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_popcnt_epi64(__m128i __A) {
+ return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_selectq_128(
+ (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
+ return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_popcnt_epi32(__m128i __A) {
+ return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
+ return (__m128i)__builtin_ia32_selectd_128(
+ (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
+ return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_popcnt_epi64(__m256i __A) {
+ return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_selectq_256(
+ (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
+ return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_popcnt_epi32(__m256i __A) {
+ return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
+ return (__m256i)__builtin_ia32_selectd_256(
+ (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
+ return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif
--- /dev/null
+/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXINTRIN_H
+#define __AVXINTRIN_H
+
+typedef double __v4df __attribute__ ((__vector_size__ (32)));
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+
+/* Unsigned types */
+typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
+typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
+typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
+typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v32qs __attribute__((__vector_size__(32)));
+
+typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
+typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
+typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));
+
+typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
+typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
+typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
+
+#if (__clang_major__ > 15)
+#ifdef __SSE2__
+/* Both _Float16 and __bf16 require SSE2 to be enabled. */
+typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
+typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
+typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
+
+typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
+typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
+#endif
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
+
+/* Arithmetic */
+/// Adds two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the sums of both
+/// operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_add_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)((__v4df)__a+(__v4df)__b);
+}
+
+/// Adds two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+/// A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the sums of both
+/// operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_add_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)((__v8sf)__a+(__v8sf)__b);
+}
+
+/// Subtracts two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the minuend.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing the subtrahend.
+/// \returns A 256-bit vector of [4 x double] containing the differences between
+/// both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_sub_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)((__v4df)__a-(__v4df)__b);
+}
+
+/// Subtracts two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the minuend.
+/// \param __b
+/// A 256-bit vector of [8 x float] containing the subtrahend.
+/// \returns A 256-bit vector of [8 x float] containing the differences between
+/// both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_sub_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)((__v8sf)__a-(__v8sf)__b);
+}
+
+/// Subtracts the even-indexed values and adds the odd-indexed values of
+/// two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the left source operand.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing the right source operand.
+/// \returns A 256-bit vector of [4 x double] containing the alternating sums
+/// and differences between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_addsub_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// Subtracts the even-indexed values and adds the odd-indexed values of
+/// two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the left source operand.
+/// \param __b
+/// A 256-bit vector of [8 x float] containing the right source operand.
+/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
+/// differences between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_addsub_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
+}
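Editorial sketch, not part of the patch: the interleaved subtract/add lane pattern of VADDSUBPS spelled out explicitly. The wrapper name is an assumption.

    #include <immintrin.h>

    /* r[2k]   = a[2k]   - b[2k]    (even-indexed lanes: difference)
     * r[2k+1] = a[2k+1] + b[2k+1]  (odd-indexed lanes:  sum)        */
    static inline __m256 interleaved_sub_add(__m256 a, __m256 b)
    {
        return _mm256_addsub_ps(a, b);
    }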
+
+/// Divides two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the dividend.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing the divisor.
+/// \returns A 256-bit vector of [4 x double] containing the quotients of both
+/// operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_div_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)((__v4df)__a/(__v4df)__b);
+}
+
+/// Divides two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the dividend.
+/// \param __b
+/// A 256-bit vector of [8 x float] containing the divisor.
+/// \returns A 256-bit vector of [8 x float] containing the quotients of both
+/// operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_div_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)((__v8sf)__a/(__v8sf)__b);
+}
+
+/// Compares two 256-bit vectors of [4 x double] and returns the greater
+/// of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing one of the operands.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing one of the operands.
+/// \returns A 256-bit vector of [4 x double] containing the maximum values
+/// between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_max_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// Compares two 256-bit vectors of [8 x float] and returns the greater
+/// of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing one of the operands.
+/// \param __b
+/// A 256-bit vector of [8 x float] containing one of the operands.
+/// \returns A 256-bit vector of [8 x float] containing the maximum values
+/// between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_max_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// Compares two 256-bit vectors of [4 x double] and returns the lesser
+/// of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing one of the operands.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing one of the operands.
+/// \returns A 256-bit vector of [4 x double] containing the minimum values
+/// between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_min_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// Compares two 256-bit vectors of [8 x float] and returns the lesser
+/// of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing one of the operands.
+/// \param __b
+/// A 256-bit vector of [8 x float] containing one of the operands.
+/// \returns A 256-bit vector of [8 x float] containing the minimum values
+/// between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_min_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// Multiplies two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing one of the operands.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing one of the operands.
+/// \returns A 256-bit vector of [4 x double] containing the products of both
+/// operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_mul_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)((__v4df)__a * (__v4df)__b);
+}
+
+/// Multiplies two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing one of the operands.
+/// \param __b
+/// A 256-bit vector of [8 x float] containing one of the operands.
+/// \returns A 256-bit vector of [8 x float] containing the products of both
+/// operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_mul_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)((__v8sf)__a * (__v8sf)__b);
+}
+
+/// Calculates the square roots of the values in a 256-bit vector of
+/// [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the square roots of the
+/// values in the operand.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_sqrt_pd(__m256d __a)
+{
+ return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
+}
+
+/// Calculates the square roots of the values in a 256-bit vector of
+/// [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the square roots of the
+/// values in the operand.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_sqrt_ps(__m256 __a)
+{
+ return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
+}
+
+/// Calculates the reciprocal square roots of the values in a 256-bit
+/// vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
+/// roots of the values in the operand.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_rsqrt_ps(__m256 __a)
+{
+ return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
+}
+
+/// Calculates the reciprocals of the values in a 256-bit vector of
+/// [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
+/// values in the operand.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_rcp_ps(__m256 __a)
+{
+ return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
+}
+
+/// Rounds the values in a 256-bit vector of [4 x double] as specified
+/// by the byte operand. The source values are rounded to integer values and
+/// returned as 64-bit double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_round_pd(__m256d V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
+///
+/// \param V
+/// A 256-bit vector of [4 x double].
+/// \param M
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used. \n
+/// 1: The PE field is not updated. \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M. \n
+/// 1: Use the current MXCSR setting. \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest. \n
+/// 01: Downward (toward negative infinity). \n
+/// 10: Upward (toward positive infinity). \n
+/// 11: Truncated.
+/// \returns A 256-bit vector of [4 x double] containing the rounded values.
+#define _mm256_round_pd(V, M) \
+ ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
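Editorial sketch, not part of the patch: how the rounding-control bits described above are usually supplied, using the _MM_FROUND_* constants that immintrin.h already provides. The wrapper names are assumptions.

    #include <immintrin.h>

    /* bits[1:0] = 00 (nearest), bit[3] = 1 (suppress precision exceptions) */
    static inline __m256d round_nearest_pd(__m256d v)
    {
        return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }

    /* bit[2] = 1: ignore bits[1:0] and use the current MXCSR rounding mode */
    static inline __m256d round_current_mode_pd(__m256d v)
    {
        return _mm256_round_pd(v, _MM_FROUND_CUR_DIRECTION);
    }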
+
+/// Rounds the values stored in a 256-bit vector of [8 x float] as
+/// specified by the byte operand. The source values are rounded to integer
+/// values and returned as floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_round_ps(__m256 V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
+///
+/// \param V
+/// A 256-bit vector of [8 x float].
+/// \param M
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used. \n
+/// 1: The PE field is not updated. \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M. \n
+/// 1: Use the current MXCSR setting. \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest. \n
+/// 01: Downward (toward negative infinity). \n
+/// 10: Upward (toward positive infinity). \n
+/// 11: Truncated.
+/// \returns A 256-bit vector of [8 x float] containing the rounded values.
+#define _mm256_round_ps(V, M) \
+ ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
+
+/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
+/// source values are rounded up to integer values and returned as 64-bit
+/// double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_ceil_pd(__m256d V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
+///
+/// \param V
+/// A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
+#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
+
+/// Rounds down the values stored in a 256-bit vector of [4 x double].
+/// The source values are rounded down to integer values and returned as
+/// 64-bit double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_floor_pd(__m256d V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
+///
+/// \param V
+/// A 256-bit vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the rounded down
+/// values.
+#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
+
+/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
+/// source values are rounded up to integer values and returned as
+/// floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_ceil_ps(__m256 V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
+///
+/// \param V
+/// A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
+#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
+
+/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
+/// source values are rounded down to integer values and returned as
+/// floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_floor_ps(__m256 V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
+///
+/// \param V
+/// A 256-bit vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
+#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
+
+/* Logical */
+/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
+/// values between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_and_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)((__v4du)__a & (__v4du)__b);
+}
+
+/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+/// A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
+/// values between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_and_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)((__v8su)__a & (__v8su)__b);
+}
+
+/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
+/// the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the left source operand. The
+/// one's complement of this value is used in the bitwise AND.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing the right source operand.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
+/// values of the second operand and the one's complement of the first
+/// operand.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_andnot_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)(~(__v4du)__a & (__v4du)__b);
+}
+
+/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
+/// the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the left source operand. The
+/// one's complement of this value is used in the bitwise AND.
+/// \param __b
+/// A 256-bit vector of [8 x float] containing the right source operand.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
+/// values of the second operand and the one's complement of the first
+/// operand.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_andnot_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)(~(__v8su)__a & (__v8su)__b);
+}
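Editorial sketch, not part of the patch: the usual application of the and-not form is clearing sign bits, i.e. a branch-free absolute value. The helper name is an assumption.

    #include <immintrin.h>

    /* ~sign_mask & x clears bit 63 of every lane, leaving |x| in each lane. */
    static inline __m256d abs_pd(__m256d x)
    {
        const __m256d sign_mask = _mm256_set1_pd(-0.0);  /* only the sign bit set */
        return _mm256_andnot_pd(sign_mask, x);
    }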
+
+/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VORPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
+/// values between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_or_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)((__v4du)__a | (__v4du)__b);
+}
+
+/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VORPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+/// A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
+/// values between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_or_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)((__v8su)__a | (__v8su)__b);
+}
+
+/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing one of the source operands.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing one of the source operands.
+/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
+/// values between both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_xor_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)((__v4du)__a ^ (__v4du)__b);
+}
+
+/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing one of the source operands.
+/// \param __b
+/// A 256-bit vector of [8 x float] containing one of the source operands.
+/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
+/// values between both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_xor_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)((__v8su)__a ^ (__v8su)__b);
+}
+
+/* Horizontal arithmetic */
+/// Horizontally adds the adjacent pairs of values contained in two
+/// 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing one of the source operands.
+/// The horizontal sums of the values are returned in the even-indexed
+/// elements of a vector of [4 x double].
+/// \param __b
+/// A 256-bit vector of [4 x double] containing one of the source operands.
+/// The horizontal sums of the values are returned in the odd-indexed
+/// elements of a vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
+/// both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_hadd_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// Horizontally adds the adjacent pairs of values contained in two
+/// 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing one of the source operands.
+/// The horizontal sums of the values are returned in the elements with
+/// index 0, 1, 4, 5 of a vector of [8 x float].
+/// \param __b
+/// A 256-bit vector of [8 x float] containing one of the source operands.
+/// The horizontal sums of the values are returned in the elements with
+/// index 2, 3, 6, 7 of a vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
+/// both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_hadd_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
+}
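Editorial note, not part of the patch: the per-128-bit-lane placement described above is easy to misread, so a worked element map may help. The wrapper name is illustrative.

    #include <immintrin.h>

    /* With a = {a0..a7} and b = {b0..b7} the result is
     * { a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7 }. */
    static inline __m256 pairwise_sums(__m256 a, __m256 b)
    {
        return _mm256_hadd_ps(a, b);
    }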
+
+/// Horizontally subtracts the adjacent pairs of values contained in two
+/// 256-bit vectors of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing one of the source operands.
+/// The horizontal differences between the values are returned in the
+/// even-indexed elements of a vector of [4 x double].
+/// \param __b
+/// A 256-bit vector of [4 x double] containing one of the source operands.
+/// The horizontal differences between the values are returned in the
+/// odd-indexed elements of a vector of [4 x double].
+/// \returns A 256-bit vector of [4 x double] containing the horizontal
+/// differences of both operands.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_hsub_pd(__m256d __a, __m256d __b)
+{
+ return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in two
+/// 256-bit vectors of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing one of the source operands.
+/// The horizontal differences between the values are returned in the
+/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
+/// \param __b
+/// A 256-bit vector of [8 x float] containing one of the source operands.
+/// The horizontal differences between the values are returned in the
+/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
+/// \returns A 256-bit vector of [8 x float] containing the horizontal
+/// differences of both operands.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_hsub_ps(__m256 __a, __m256 __b)
+{
+ return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/* Vector permutations */
+/// Copies the values in a 128-bit vector of [2 x double] as specified
+/// by the 128-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __c
+/// A 128-bit integer vector operand specifying how the values are to be
+/// copied. \n
+/// Bit [1]: \n
+/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+/// vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
+/// returned vector. \n
+/// Bit [65]: \n
+/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
+/// returned vector.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
+static __inline __m128d __DEFAULT_FN_ATTRS128
+_mm_permutevar_pd(__m128d __a, __m128i __c)
+{
+ return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
+}
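Editorial sketch, not part of the patch: only bit 1 of each 64-bit control lane is consulted by this intrinsic, as documented above. The helper name is an assumption.

    #include <immintrin.h>

    /* Set bit 1 (value 2) in both control lanes so both result lanes copy
     * the high element of a. */
    static inline __m128d dup_high_pd(__m128d a)
    {
        const __m128i ctrl = _mm_set_epi64x(2, 2);
        return _mm_permutevar_pd(a, ctrl);
    }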
+
+/// Copies the values in a 256-bit vector of [4 x double] as specified
+/// by the 256-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \param __c
+/// A 256-bit integer vector operand specifying how the values are to be
+/// copied. \n
+/// Bit [1]: \n
+/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+/// vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
+/// returned vector. \n
+/// Bit [65]: \n
+/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// Bit [129]: \n
+/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
+/// returned vector. \n
+/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
+/// returned vector. \n
+/// Bit [193]: \n
+/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
+/// returned vector. \n
+/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
+/// returned vector.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_permutevar_pd(__m256d __a, __m256i __c)
+{
+ return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
+}
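+
+/* Illustrative usage sketch, assuming <immintrin.h> has been included:
+ * only bit 1 of each 64-bit control element is consulted, and each 128-bit
+ * lane is permuted independently.
+ *
+ *   __m256d a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+ *   __m256i c = _mm256_setr_epi64x(2, 0, 2, 0);  // bit 1: set, clear, set, clear
+ *   __m256d r = _mm256_permutevar_pd(a, c);
+ *   // r = { 2.0, 1.0, 4.0, 3.0 }: positions whose control element has
+ *   // bit 1 set take the upper double of their lane, the others the lower.
+ */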
+
+/// Copies the values stored in a 128-bit vector of [4 x float] as
+/// specified by the 128-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __c
+/// A 128-bit integer vector operand specifying how the values are to be
+/// copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// Bits [33:32]: \n
+/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// Bits [65:64]: \n
+/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// Bits [97:96]: \n
+/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
+/// returned vector.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
+static __inline __m128 __DEFAULT_FN_ATTRS128
+_mm_permutevar_ps(__m128 __a, __m128i __c)
+{
+ return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
+}
+
+/// Copies the values stored in a 256-bit vector of [8 x float] as
+/// specified by the 256-bit integer vector operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \param __c
+/// A 256-bit integer vector operand specifying how the values are to be
+/// copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// Bits [33:32]: \n
+/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// Bits [65:64]: \n
+/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// Bits [97:96]: \n
+/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// Bits [129:128]: \n
+/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// Bits [161:160]: \n
+/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// Bits [193:192]: \n
+/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// Bits [225:224]: \n
+/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
+/// returned vector.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_permutevar_ps(__m256 __a, __m256i __c)
+{
+ return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
+}
+
+/// Copies the values in a 128-bit vector of [2 x double] as specified
+/// by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_permute_pd(__m128d A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
+///
+/// \param A
+/// A 128-bit vector of [2 x double].
+/// \param C
+/// An immediate integer operand specifying how the values are to be
+/// copied. \n
+/// Bit [0]: \n
+/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+/// vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
+/// returned vector. \n
+/// Bit [1]: \n
+/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
+/// returned vector.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
+#define _mm_permute_pd(A, C) \
+ ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
+
+/// Copies the values in a 256-bit vector of [4 x double] as specified by
+/// the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_permute_pd(__m256d A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
+///
+/// \param A
+/// A 256-bit vector of [4 x double].
+/// \param C
+/// An immediate integer operand specifying how the values are to be
+/// copied. \n
+/// Bit [0]: \n
+/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
+/// vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
+/// returned vector. \n
+/// Bit [1]: \n
+/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
+/// returned vector. \n
+/// Bit [2]: \n
+/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
+/// returned vector. \n
+/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
+/// returned vector. \n
+/// Bit [3]: \n
+/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
+/// returned vector. \n
+/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
+/// returned vector.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+#define _mm256_permute_pd(A, C) \
+ ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
+
+/// Copies the values in a 128-bit vector of [4 x float] as specified by
+/// the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_permute_ps(__m128 A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
+///
+/// \param A
+/// A 128-bit vector of [4 x float].
+/// \param C
+/// An immediate integer operand specifying how the values are to be
+/// copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// Bits [3:2]: \n
+/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// Bits [5:4]: \n
+/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// Bits [7:6]: \n
+/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
+/// returned vector.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
+#define _mm_permute_ps(A, C) \
+ ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
+
+/// Copies the values in a 256-bit vector of [8 x float] as specified by
+/// the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_permute_ps(__m256 A, const int C);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
+///
+/// \param A
+/// A 256-bit vector of [8 x float].
+/// \param C
+/// An immediate integer operand specifying how the values are to be
+/// copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
+/// returned vector. \n
+/// Bits [3:2]: \n
+/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
+/// returned vector. \n
+/// Bits [5:4]: \n
+/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
+/// returned vector. \n
+/// Bits [7:6]: \n
+/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
+/// returned vector. \n
+/// Bits [1:0]: \n
+/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
+/// returned vector. \n
+/// Bits [3:2]: \n
+/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
+/// returned vector. \n
+/// Bits [5:4]: \n
+/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
+/// returned vector. \n
+/// Bits [7:6]: \n
+/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
+/// returned vector. \n
+/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
+/// returned vector.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+#define _mm256_permute_ps(A, C) \
+ ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
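+
+/* Illustrative usage sketch, assuming <immintrin.h> has been included:
+ * the same 8-bit immediate is applied to both 128-bit lanes, so 0x1B
+ * reverses the four floats of each lane independently.
+ *
+ *   __m256 a = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
+ *   __m256 r = _mm256_permute_ps(a, 0x1B);
+ *   // r = { 3, 2, 1, 0, 7, 6, 5, 4 }
+ */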
+
+/// Permutes 128-bit data values stored in two 256-bit vectors of
+/// [4 x double], as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
+///
+/// \param V1
+/// A 256-bit vector of [4 x double].
+/// \param V2
+/// A 256-bit vector of [4 x double].
+/// \param M
+/// An immediate integer operand specifying how the values are to be
+/// permuted. \n
+/// Bits [1:0]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// Bits [5:4]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
+/// destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+#define _mm256_permute2f128_pd(V1, V2, M) \
+ ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
+ (__v4df)(__m256d)(V2), (int)(M)))
+
+/// Permutes 128-bit data values stored in two 256-bit vectors of
+/// [8 x float], as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
+///
+/// \param V1
+/// A 256-bit vector of [8 x float].
+/// \param V2
+/// A 256-bit vector of [8 x float].
+/// \param M
+/// An immediate integer operand specifying how the values are to be
+/// permuted. \n
+/// Bits [1:0]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// Bits [5:4]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
+/// destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+#define _mm256_permute2f128_ps(V1, V2, M) \
+ ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
+ (__v8sf)(__m256)(V2), (int)(M)))
+
+/// Permutes 128-bit data values stored in two 256-bit integer vectors,
+/// as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
+///
+/// \param V1
+/// A 256-bit integer vector.
+/// \param V2
+/// A 256-bit integer vector.
+/// \param M
+/// An immediate integer operand specifying how the values are to be copied. \n
+/// Bits [1:0]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
+/// destination. \n
+/// Bits [5:4]: \n
+/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
+/// destination. \n
+/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
+/// destination. \n
+/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
+/// destination.
+/// \returns A 256-bit integer vector containing the copied values.
+#define _mm256_permute2f128_si256(V1, V2, M) \
+ ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
+ (__v8si)(__m256i)(V2), (int)(M)))
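+
+/* Illustrative usage sketch, assuming <immintrin.h> has been included:
+ * selecting the upper half of V1 and the lower half of V2 with M = 0x21.
+ *
+ *   __m256d v1 = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+ *   __m256d v2 = _mm256_setr_pd(5.0, 6.0, 7.0, 8.0);
+ *   __m256d r  = _mm256_permute2f128_pd(v1, v2, 0x21);
+ *   // Bits [1:0] = 01 -> low half of r is the high half of v1;
+ *   // bits [5:4] = 10 -> high half of r is the low half of v2.
+ *   // r = { 3.0, 4.0, 5.0, 6.0 }
+ */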
+
+/* Vector Blend */
+/// Merges 64-bit double-precision data values stored in either of the
+/// two 256-bit vectors of [4 x double], as specified by the immediate
+/// integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
+///
+/// \param V1
+/// A 256-bit vector of [4 x double].
+/// \param V2
+/// A 256-bit vector of [4 x double].
+/// \param M
+/// An immediate integer operand, with mask bits [3:0] specifying how the
+/// values are to be copied. The position of the mask bit corresponds to the
+/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
+/// element in operand \a V1 is copied to the same position in the
+/// destination. When a mask bit is 1, the corresponding 64-bit element in
+/// operand \a V2 is copied to the same position in the destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+#define _mm256_blend_pd(V1, V2, M) \
+ ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
+ (__v4df)(__m256d)(V2), (int)(M)))
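+
+/* Illustrative usage sketch, assuming <immintrin.h> has been included:
+ * mask bit i selects element i from V2 (bit set) or V1 (bit clear).
+ *
+ *   __m256d v1 = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+ *   __m256d v2 = _mm256_setr_pd(10.0, 20.0, 30.0, 40.0);
+ *   __m256d r  = _mm256_blend_pd(v1, v2, 0x5);  // mask = 0b0101
+ *   // r = { 10.0, 2.0, 30.0, 4.0 }
+ */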
+
+/// Merges 32-bit single-precision data values stored in either of the
+/// two 256-bit vectors of [8 x float], as specified by the immediate
+/// integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
+///
+/// \param V1
+/// A 256-bit vector of [8 x float].
+/// \param V2
+/// A 256-bit vector of [8 x float].
+/// \param M
+/// An immediate integer operand, with mask bits [7:0] specifying how the
+/// values are to be copied. The position of the mask bit corresponds to the
+/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
+/// element in operand \a V1 is copied to the same position in the
+/// destination. When a mask bit is 1, the corresponding 32-bit element in
+/// operand \a V2 is copied to the same position in the destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+#define _mm256_blend_ps(V1, V2, M) \
+ ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
+ (__v8sf)(__m256)(V2), (int)(M)))
+
+/// Merges 64-bit double-precision data values stored in either of the
+/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \param __b
+/// A 256-bit vector of [4 x double].
+/// \param __c
+/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
+/// how the values are to be copied. The position of the mask bit corresponds
+/// to the most significant bit of a copied value. When a mask bit is 0, the
+/// corresponding 64-bit element in operand \a __a is copied to the same
+/// position in the destination. When a mask bit is 1, the corresponding
+/// 64-bit element in operand \a __b is copied to the same position in the
+/// destination.
+/// \returns A 256-bit vector of [4 x double] containing the copied values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
+{
+ return (__m256d)__builtin_ia32_blendvpd256(
+ (__v4df)__a, (__v4df)__b, (__v4df)__c);
+}
+
+/// Merges 32-bit single-precision data values stored in either of the
+/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \param __b
+/// A 256-bit vector of [8 x float].
+/// \param __c
+/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
+/// and 31 specifying how the values are to be copied. The position of the
+/// mask bit corresponds to the most significant bit of a copied value. When
+/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
+/// copied to the same position in the destination. When a mask bit is 1, the
+/// corresponding 32-bit element in operand \a __b is copied to the same
+/// position in the destination.
+/// \returns A 256-bit vector of [8 x float] containing the copied values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
+{
+ return (__m256)__builtin_ia32_blendvps256(
+ (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
+}
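+
+/* Illustrative usage sketch, assuming <immintrin.h> has been included:
+ * only the sign bit of each mask element matters, so a mask produced by a
+ * comparison (all-ones / all-zeros per element) selects from __b or __a
+ * respectively.
+ *
+ *   __m256 lo  = _mm256_set1_ps(-1.0f);
+ *   __m256 hi  = _mm256_set1_ps(+1.0f);
+ *   __m256 msk = _mm256_setr_ps(-0.0f, 0.0f, -0.0f, 0.0f,
+ *                               -0.0f, 0.0f, -0.0f, 0.0f);
+ *   __m256 r   = _mm256_blendv_ps(lo, hi, msk);
+ *   // r = { 1, -1, 1, -1, 1, -1, 1, -1 }: elements whose mask sign bit is
+ *   // set come from hi, the others from lo.
+ */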
+
+/* Vector Dot Product */
+/// Computes two dot products in parallel, using the lower and upper
+/// halves of two [8 x float] vectors as input to the two computations, and
+/// returning the two dot products in the lower and upper halves of the
+/// [8 x float] result.
+///
+/// The immediate integer operand controls which input elements will
+/// contribute to the dot product, and where the final results are returned.
+/// In general, for each dot product, the four corresponding elements of the
+/// input vectors are multiplied; the first two and second two products are
+/// summed, then the two sums are added to form the final result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
+///
+/// \param V1
+/// A vector of [8 x float] values, treated as two [4 x float] vectors.
+/// \param V2
+/// A vector of [8 x float] values, treated as two [4 x float] vectors.
+/// \param M
+/// An immediate integer argument. Bits [7:4] determine which elements of
+/// the input vectors are used, with bit [4] corresponding to the lowest
+/// element and bit [7] corresponding to the highest element of each [4 x
+/// float] subvector. If a bit is set, the corresponding elements from the
+/// two input vectors are used as an input for dot product; otherwise that
+/// input is treated as zero. Bits [3:0] determine which elements of the
+/// result will receive a copy of the final dot product, with bit [0]
+/// corresponding to the lowest element and bit [3] corresponding to the
+/// highest element of each [4 x float] subvector. If a bit is set, the dot
+/// product is returned in the corresponding element; otherwise that element
+/// is set to zero. The bitmask is applied in the same way to each of the
+/// two parallel dot product computations.
+/// \returns A 256-bit vector of [8 x float] containing the two dot products.
+#define _mm256_dp_ps(V1, V2, M) \
+ ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
+ (__v8sf)(__m256)(V2), (M)))
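+
+/* Illustrative usage sketch, assuming <immintrin.h> has been included:
+ * with M = 0xF1, all four elements of each 128-bit lane contribute to the
+ * dot product and the result is written only to element 0 of that lane.
+ *
+ *   __m256 a = _mm256_set1_ps(1.0f);
+ *   __m256 b = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
+ *   __m256 r = _mm256_dp_ps(a, b, 0xF1);
+ *   // r = { 10, 0, 0, 0, 26, 0, 0, 0 }
+ */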
+
+/* Vector shuffle */
+/// Selects 8 float values from the 256-bit operands of [8 x float], as
+/// specified by the immediate value operand.
+///
+/// The four selected elements in each operand are copied to the destination
+/// according to the bits specified in the immediate operand. The selected
+/// elements from the first 256-bit operand are copied to bits [63:0] and
+/// bits [191:128] of the destination, and the selected elements from the
+/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
+/// the destination. For example, if bits [7:0] of the immediate operand
+/// contain a value of 0xFF, the 256-bit destination vector would contain the
+/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
+///
+/// \param a
+/// A 256-bit vector of [8 x float]. The four selected elements in this
+/// operand are copied to bits [63:0] and bits [191:128] in the destination,
+/// according to the bits specified in the immediate operand.
+/// \param b
+/// A 256-bit vector of [8 x float]. The four selected elements in this
+/// operand are copied to bits [127:64] and bits [255:192] in the
+/// destination, according to the bits specified in the immediate operand.
+/// \param mask
+/// An immediate value containing an 8-bit value specifying which elements to
+/// copy from \a a and \a b. \n
+/// Bits [3:0] specify the values copied from operand \a a. \n
+/// Bits [7:4] specify the values copied from operand \a b. \n
+/// The destinations within the 256-bit destination are assigned values as
+/// follows, according to the bit value assignments described below: \n
+/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
+/// destination. \n
+/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
+/// destination. \n
+/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
+/// destination. \n
+/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
+/// the destination. \n
+/// Bit value assignments: \n
+/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
+/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
+/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
+/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
+/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
+#define _mm256_shuffle_ps(a, b, mask) \
+ ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
+ (__v8sf)(__m256)(b), (int)(mask)))
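+
+/* Illustrative usage sketch, assuming <immintrin.h> has been included:
+ * this behaves like _mm_shuffle_ps applied to each 128-bit lane, with
+ * elements 0-1 of each destination lane taken from a and elements 2-3
+ * from b.
+ *
+ *   __m256 a = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
+ *   __m256 b = _mm256_setr_ps(10, 11, 12, 13, 14, 15, 16, 17);
+ *   __m256 r = _mm256_shuffle_ps(a, b, 0xE4);  // selectors 0, 1, 2, 3
+ *   // r = { 0, 1, 12, 13, 4, 5, 16, 17 }
+ */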
+
+/// Selects four double-precision values from the 256-bit operands of
+/// [4 x double], as specified by the immediate value operand.
+///
+/// The selected elements from the first 256-bit operand are copied to bits
+/// [63:0] and bits [191:128] in the destination, and the selected elements
+/// from the second 256-bit operand are copied to bits [127:64] and bits
+/// [255:192] in the destination. For example, if bits [3:0] of the immediate
+/// operand contain a value of 0xF, the 256-bit destination vector would
+/// contain the following values: b[3], a[3], b[1], a[1].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
+///
+/// \param a
+/// A 256-bit vector of [4 x double].
+/// \param b
+/// A 256-bit vector of [4 x double].
+/// \param mask
+/// An immediate value containing 8-bit values specifying which elements to
+/// copy from \a a and \a b: \n
+/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
+/// destination. \n
+/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
+/// destination. \n
+/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
+/// destination. \n
+/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
+/// destination. \n
+/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
+/// destination. \n
+/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
+/// destination. \n
+/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
+/// destination. \n
+/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
+/// destination.
+/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
+#define _mm256_shuffle_pd(a, b, mask) \
+ ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
+ (__v4df)(__m256d)(b), (int)(mask)))
+
+/* Compare */
+#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
+#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
+#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
+#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
+#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
+#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
+#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
+#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
+#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
+#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
+#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
+#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
+#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
+#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
+#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
+#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
+#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
+#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
+#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
+#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
+#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
+#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
+#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
+#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
+#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
+#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
+#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
+#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
+#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
+#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
+#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
+#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
+
+/// Compares each of the corresponding double-precision values of two
+/// 128-bit vectors of [2 x double], using the operation specified by the
+/// immediate integer operand.
+///
+/// Returns a [2 x double] vector consisting of two doubles corresponding to
+/// the two comparison results: zero if the comparison is false, and all 1's
+/// if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
+///
+/// \param a
+/// A 128-bit vector of [2 x double].
+/// \param b
+/// A 128-bit vector of [2 x double].
+/// \param c
+/// An immediate integer operand, with bits [4:0] specifying which comparison
+/// operation to use: \n
+/// 0x00: Equal (ordered, non-signaling) \n
+/// 0x01: Less-than (ordered, signaling) \n
+/// 0x02: Less-than-or-equal (ordered, signaling) \n
+/// 0x03: Unordered (non-signaling) \n
+/// 0x04: Not-equal (unordered, non-signaling) \n
+/// 0x05: Not-less-than (unordered, signaling) \n
+/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
+/// 0x07: Ordered (non-signaling) \n
+/// 0x08: Equal (unordered, non-signaling) \n
+/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
+/// 0x0A: Not-greater-than (unordered, signaling) \n
+/// 0x0B: False (ordered, non-signaling) \n
+/// 0x0C: Not-equal (ordered, non-signaling) \n
+/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
+/// 0x0E: Greater-than (ordered, signaling) \n
+/// 0x0F: True (unordered, non-signaling) \n
+/// 0x10: Equal (ordered, signaling) \n
+/// 0x11: Less-than (ordered, non-signaling) \n
+/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
+/// 0x13: Unordered (signaling) \n
+/// 0x14: Not-equal (unordered, signaling) \n
+/// 0x15: Not-less-than (unordered, non-signaling) \n
+/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+/// 0x17: Ordered (signaling) \n
+/// 0x18: Equal (unordered, signaling) \n
+/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+/// 0x1A: Not-greater-than (unordered, non-signaling) \n
+/// 0x1B: False (ordered, signaling) \n
+/// 0x1C: Not-equal (ordered, signaling) \n
+/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+/// 0x1E: Greater-than (ordered, non-signaling) \n
+/// 0x1F: True (unordered, signaling)
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
+#define _mm_cmp_pd(a, b, c) \
+ ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
+ (__v2df)(__m128d)(b), (c)))
+
+/// Compares each of the corresponding values of two 128-bit vectors of
+/// [4 x float], using the operation specified by the immediate integer
+/// operand.
+///
+/// Returns a [4 x float] vector consisting of four floats corresponding to
+/// the four comparison results: zero if the comparison is false, and all 1's
+/// if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
+///
+/// \param a
+/// A 128-bit vector of [4 x float].
+/// \param b
+/// A 128-bit vector of [4 x float].
+/// \param c
+/// An immediate integer operand, with bits [4:0] specifying which comparison
+/// operation to use: \n
+/// 0x00: Equal (ordered, non-signaling) \n
+/// 0x01: Less-than (ordered, signaling) \n
+/// 0x02: Less-than-or-equal (ordered, signaling) \n
+/// 0x03: Unordered (non-signaling) \n
+/// 0x04: Not-equal (unordered, non-signaling) \n
+/// 0x05: Not-less-than (unordered, signaling) \n
+/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
+/// 0x07: Ordered (non-signaling) \n
+/// 0x08: Equal (unordered, non-signaling) \n
+/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
+/// 0x0A: Not-greater-than (unordered, signaling) \n
+/// 0x0B: False (ordered, non-signaling) \n
+/// 0x0C: Not-equal (ordered, non-signaling) \n
+/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
+/// 0x0E: Greater-than (ordered, signaling) \n
+/// 0x0F: True (unordered, non-signaling) \n
+/// 0x10: Equal (ordered, signaling) \n
+/// 0x11: Less-than (ordered, non-signaling) \n
+/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
+/// 0x13: Unordered (signaling) \n
+/// 0x14: Not-equal (unordered, signaling) \n
+/// 0x15: Not-less-than (unordered, non-signaling) \n
+/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+/// 0x17: Ordered (signaling) \n
+/// 0x18: Equal (unordered, signaling) \n
+/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+/// 0x1A: Not-greater-than (unordered, non-signaling) \n
+/// 0x1B: False (ordered, signaling) \n
+/// 0x1C: Not-equal (ordered, signaling) \n
+/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+/// 0x1E: Greater-than (ordered, non-signaling) \n
+/// 0x1F: True (unordered, signaling)
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+#define _mm_cmp_ps(a, b, c) \
+ ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
+ (__v4sf)(__m128)(b), (c)))
+
+/// Compares each of the corresponding double-precision values of two
+/// 256-bit vectors of [4 x double], using the operation specified by the
+/// immediate integer operand.
+///
+/// Returns a [4 x double] vector consisting of four doubles corresponding to
+/// the four comparison results: zero if the comparison is false, and all 1's
+/// if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
+///
+/// \param a
+/// A 256-bit vector of [4 x double].
+/// \param b
+/// A 256-bit vector of [4 x double].
+/// \param c
+/// An immediate integer operand, with bits [4:0] specifying which comparison
+/// operation to use: \n
+/// 0x00: Equal (ordered, non-signaling) \n
+/// 0x01: Less-than (ordered, signaling) \n
+/// 0x02: Less-than-or-equal (ordered, signaling) \n
+/// 0x03: Unordered (non-signaling) \n
+/// 0x04: Not-equal (unordered, non-signaling) \n
+/// 0x05: Not-less-than (unordered, signaling) \n
+/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
+/// 0x07: Ordered (non-signaling) \n
+/// 0x08: Equal (unordered, non-signaling) \n
+/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
+/// 0x0A: Not-greater-than (unordered, signaling) \n
+/// 0x0B: False (ordered, non-signaling) \n
+/// 0x0C: Not-equal (ordered, non-signaling) \n
+/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
+/// 0x0E: Greater-than (ordered, signaling) \n
+/// 0x0F: True (unordered, non-signaling) \n
+/// 0x10: Equal (ordered, signaling) \n
+/// 0x11: Less-than (ordered, non-signaling) \n
+/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
+/// 0x13: Unordered (signaling) \n
+/// 0x14: Not-equal (unordered, signaling) \n
+/// 0x15: Not-less-than (unordered, non-signaling) \n
+/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+/// 0x17: Ordered (signaling) \n
+/// 0x18: Equal (unordered, signaling) \n
+/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+/// 0x1A: Not-greater-than (unordered, non-signaling) \n
+/// 0x1B: False (ordered, signaling) \n
+/// 0x1C: Not-equal (ordered, signaling) \n
+/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+/// 0x1E: Greater-than (ordered, non-signaling) \n
+/// 0x1F: True (unordered, signaling)
+/// \returns A 256-bit vector of [4 x double] containing the comparison results.
+#define _mm256_cmp_pd(a, b, c) \
+ ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
+ (__v4df)(__m256d)(b), (c)))
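+
+/* Illustrative usage sketch, assuming <immintrin.h> has been included:
+ * each comparison result is an all-ones or all-zeros 64-bit mask, which can
+ * be collapsed to a bitmask with _mm256_movemask_pd or fed to
+ * _mm256_blendv_pd.
+ *
+ *   __m256d a = _mm256_setr_pd(1.0, 5.0, 3.0, 7.0);
+ *   __m256d b = _mm256_setr_pd(2.0, 4.0, 6.0, 8.0);
+ *   __m256d m = _mm256_cmp_pd(a, b, _CMP_LT_OQ);
+ *   int bits  = _mm256_movemask_pd(m);  // 0b1101: a < b in elements 0, 2, 3
+ */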
+
+/// Compares each of the corresponding values of two 256-bit vectors of
+/// [8 x float], using the operation specified by the immediate integer
+/// operand.
+///
+/// Returns a [8 x float] vector consisting of eight floats corresponding to
+/// the eight comparison results: zero if the comparison is false, and all
+/// 1's if the comparison is true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
+///
+/// \param a
+/// A 256-bit vector of [8 x float].
+/// \param b
+/// A 256-bit vector of [8 x float].
+/// \param c
+/// An immediate integer operand, with bits [4:0] specifying which comparison
+/// operation to use: \n
+/// 0x00: Equal (ordered, non-signaling) \n
+/// 0x01: Less-than (ordered, signaling) \n
+/// 0x02: Less-than-or-equal (ordered, signaling) \n
+/// 0x03: Unordered (non-signaling) \n
+/// 0x04: Not-equal (unordered, non-signaling) \n
+/// 0x05: Not-less-than (unordered, signaling) \n
+/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
+/// 0x07: Ordered (non-signaling) \n
+/// 0x08: Equal (unordered, non-signaling) \n
+/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
+/// 0x0A: Not-greater-than (unordered, signaling) \n
+/// 0x0B: False (ordered, non-signaling) \n
+/// 0x0C: Not-equal (ordered, non-signaling) \n
+/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
+/// 0x0E: Greater-than (ordered, signaling) \n
+/// 0x0F: True (unordered, non-signaling) \n
+/// 0x10: Equal (ordered, signaling) \n
+/// 0x11: Less-than (ordered, non-signaling) \n
+/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
+/// 0x13: Unordered (signaling) \n
+/// 0x14: Not-equal (unordered, signaling) \n
+/// 0x15: Not-less-than (unordered, non-signaling) \n
+/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+/// 0x17: Ordered (signaling) \n
+/// 0x18: Equal (unordered, signaling) \n
+/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+/// 0x1A: Not-greater-than (unordered, non-signaling) \n
+/// 0x1B: False (ordered, signaling) \n
+/// 0x1C: Not-equal (ordered, signaling) \n
+/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+/// 0x1E: Greater-than (ordered, non-signaling) \n
+/// 0x1F: True (unordered, signaling)
+/// \returns A 256-bit vector of [8 x float] containing the comparison results.
+#define _mm256_cmp_ps(a, b, c) \
+ ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
+ (__v8sf)(__m256)(b), (c)))
+
+/// Compares each of the corresponding scalar double-precision values of
+/// two 128-bit vectors of [2 x double], using the operation specified by the
+/// immediate integer operand.
+///
+/// If the result is true, the lower 64 bits of the destination vector are
+/// set; otherwise they are cleared. The upper 64 bits of the destination are
+/// copied from the corresponding bits of \a a.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
+///
+/// \param a
+/// A 128-bit vector of [2 x double].
+/// \param b
+/// A 128-bit vector of [2 x double].
+/// \param c
+/// An immediate integer operand, with bits [4:0] specifying which comparison
+/// operation to use: \n
+/// 0x00: Equal (ordered, non-signaling) \n
+/// 0x01: Less-than (ordered, signaling) \n
+/// 0x02: Less-than-or-equal (ordered, signaling) \n
+/// 0x03: Unordered (non-signaling) \n
+/// 0x04: Not-equal (unordered, non-signaling) \n
+/// 0x05: Not-less-than (unordered, signaling) \n
+/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
+/// 0x07: Ordered (non-signaling) \n
+/// 0x08: Equal (unordered, non-signaling) \n
+/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
+/// 0x0A: Not-greater-than (unordered, signaling) \n
+/// 0x0B: False (ordered, non-signaling) \n
+/// 0x0C: Not-equal (ordered, non-signaling) \n
+/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
+/// 0x0E: Greater-than (ordered, signaling) \n
+/// 0x0F: True (unordered, non-signaling) \n
+/// 0x10: Equal (ordered, signaling) \n
+/// 0x11: Less-than (ordered, non-signaling) \n
+/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
+/// 0x13: Unordered (signaling) \n
+/// 0x14: Not-equal (unordered, signaling) \n
+/// 0x15: Not-less-than (unordered, non-signaling) \n
+/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+/// 0x17: Ordered (signaling) \n
+/// 0x18: Equal (unordered, signaling) \n
+/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+/// 0x1A: Not-greater-than (unordered, non-signaling) \n
+/// 0x1B: False (ordered, signaling) \n
+/// 0x1C: Not-equal (ordered, signaling) \n
+/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+/// 0x1E: Greater-than (ordered, non-signaling) \n
+/// 0x1F: True (unordered, signaling)
+/// \returns A 128-bit vector of [2 x double] containing the comparison results.
+#define _mm_cmp_sd(a, b, c) \
+ ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
+ (__v2df)(__m128d)(b), (c)))
+
+/// Compares each of the corresponding scalar values of two 128-bit
+/// vectors of [4 x float], using the operation specified by the immediate
+/// integer operand.
+///
+/// If the result is true, the lower 32 bits of the destination vector are
+/// set; otherwise they are cleared. The upper 96 bits of the destination are
+/// copied from the corresponding bits of \a a.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
+///
+/// \param a
+/// A 128-bit vector of [4 x float].
+/// \param b
+/// A 128-bit vector of [4 x float].
+/// \param c
+/// An immediate integer operand, with bits [4:0] specifying which comparison
+/// operation to use: \n
+/// 0x00: Equal (ordered, non-signaling) \n
+/// 0x01: Less-than (ordered, signaling) \n
+/// 0x02: Less-than-or-equal (ordered, signaling) \n
+/// 0x03: Unordered (non-signaling) \n
+/// 0x04: Not-equal (unordered, non-signaling) \n
+/// 0x05: Not-less-than (unordered, signaling) \n
+/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
+/// 0x07: Ordered (non-signaling) \n
+/// 0x08: Equal (unordered, non-signaling) \n
+/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
+/// 0x0A: Not-greater-than (unordered, signaling) \n
+/// 0x0B: False (ordered, non-signaling) \n
+/// 0x0C: Not-equal (ordered, non-signaling) \n
+/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
+/// 0x0E: Greater-than (ordered, signaling) \n
+/// 0x0F: True (unordered, non-signaling) \n
+/// 0x10: Equal (ordered, signaling) \n
+/// 0x11: Less-than (ordered, non-signaling) \n
+/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
+/// 0x13: Unordered (signaling) \n
+/// 0x14: Not-equal (unordered, signaling) \n
+/// 0x15: Not-less-than (unordered, non-signaling) \n
+/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
+/// 0x17: Ordered (signaling) \n
+/// 0x18: Equal (unordered, signaling) \n
+/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
+/// 0x1A: Not-greater-than (unordered, non-signaling) \n
+/// 0x1B: False (ordered, signaling) \n
+/// 0x1C: Not-equal (ordered, signaling) \n
+/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
+/// 0x1E: Greater-than (ordered, non-signaling) \n
+/// 0x1F: True (unordered, signaling)
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+#define _mm_cmp_ss(a, b, c) \
+ ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
+ (__v4sf)(__m128)(b), (c)))
+
+/// Takes a [8 x i32] vector and returns the vector element value
+/// indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+/// instruction.
+///
+/// \param X
+/// A 256-bit vector of [8 x i32].
+/// \param N
+/// An immediate integer operand with bits [2:0] determining which vector
+/// element is extracted and returned.
+/// \returns A 32-bit integer containing the extracted 32 bits of extended
+/// packed data.
+#define _mm256_extract_epi32(X, N) \
+ ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
+
+/// Takes a [16 x i16] vector and returns the vector element value
+/// indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+/// instruction.
+///
+/// \param X
+/// A 256-bit integer vector of [16 x i16].
+/// \param N
+/// An immediate integer operand with bits [3:0] determining which vector
+/// element is extracted and returned.
+/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
+/// packed data.
+#define _mm256_extract_epi16(X, N) \
+ ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
+ (int)(N)))
+
+/// Takes a [32 x i8] vector and returns the vector element value
+/// indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+/// instruction.
+///
+/// \param X
+/// A 256-bit integer vector of [32 x i8].
+/// \param N
+/// An immediate integer operand with bits [4:0] determining which vector
+/// element is extracted and returned.
+/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
+/// packed data.
+#define _mm256_extract_epi8(X, N) \
+ ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
+ (int)(N)))
+
+#ifdef __x86_64__
+/// Takes a [4 x i64] vector and returns the vector element value
+/// indexed by the immediate constant operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
+/// instruction.
+///
+/// \param X
+/// A 256-bit integer vector of [4 x i64].
+/// \param N
+/// An immediate integer operand with bits [1:0] determining which vector
+/// element is extracted and returned.
+/// \returns A 64-bit integer containing the extracted 64 bits of extended
+/// packed data.
+#define _mm256_extract_epi64(X, N) \
+ ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
+#endif
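+
+/* Illustrative usage sketch, assuming <immintrin.h> has been included and
+ * the index is a compile-time constant in range:
+ *
+ *   __m256i x = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ *   int v = _mm256_extract_epi32(x, 5);  // v == 5
+ */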
+
+/// Takes a [8 x i32] vector and replaces the vector element value
+/// indexed by the immediate constant operand by a new value. Returns the
+/// modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+/// instruction.
+///
+/// \param X
+/// A vector of [8 x i32] to be used by the insert operation.
+/// \param I
+/// An integer value. The replacement value for the insert operation.
+/// \param N
+/// An immediate integer specifying the index of the vector element to be
+/// replaced.
+/// \returns A copy of vector \a X, after replacing its element indexed by
+/// \a N with \a I.
+#define _mm256_insert_epi32(X, I, N) \
+ ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
+ (int)(I), (int)(N)))
+
+/// Takes a [16 x i16] vector and replaces the vector element value
+/// indexed by the immediate constant operand with a new value. Returns the
+/// modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+/// instruction.
+///
+/// \param X
+/// A vector of [16 x i16] to be used by the insert operation.
+/// \param I
+/// An i16 integer value. The replacement value for the insert operation.
+/// \param N
+/// An immediate integer specifying the index of the vector element to be
+/// replaced.
+/// \returns A copy of vector \a X, after replacing its element indexed by
+/// \a N with \a I.
+#define _mm256_insert_epi16(X, I, N) \
+ ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
+ (int)(I), (int)(N)))
+
+/// Takes a [32 x i8] vector and replaces the vector element value
+/// indexed by the immediate constant operand with a new value. Returns the
+/// modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+/// instruction.
+///
+/// \param X
+/// A vector of [32 x i8] to be used by the insert operation.
+/// \param I
+/// An i8 integer value. The replacement value for the insert operation.
+/// \param N
+/// An immediate integer specifying the index of the vector element to be
+/// replaced.
+/// \returns A copy of vector \a X, after replacing its element indexed by
+/// \a N with \a I.
+#define _mm256_insert_epi8(X, I, N) \
+ ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
+ (int)(I), (int)(N)))
+
+#ifdef __x86_64__
+/// Takes a [4 x i64] vector and replaces the vector element value
+/// indexed by the immediate constant operand with a new value. Returns the
+/// modified vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
+/// instruction.
+///
+/// \param X
+/// A vector of [4 x i64] to be used by the insert operation.
+/// \param I
+/// A 64-bit integer value. The replacement value for the insert operation.
+/// \param N
+/// An immediate integer specifying the index of the vector element to be
+/// replaced.
+/// \returns A copy of vector \a X, after replacing its element indexed by
+/// \a N with \a I.
+#define _mm256_insert_epi64(X, I, N) \
+ ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
+ (long long)(I), (int)(N)))
+#endif
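+
+/* Illustrative usage sketch, assuming <immintrin.h> has been included:
+ * the input vector is not modified; a copy with the replaced element is
+ * returned.
+ *
+ *   __m256i x = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ *   __m256i y = _mm256_insert_epi32(x, 100, 3);
+ *   // y = { 0, 1, 2, 100, 4, 5, 6, 7 }; x is unchanged.
+ */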
+
+/* Conversion */
+/// Converts a vector of [4 x i32] into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [4 x i32].
+/// \returns A 256-bit vector of [4 x double] containing the converted values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_pd(__m128i __a)
+{
+ return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
+}
+
+/// Converts a vector of [8 x i32] into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \returns A 256-bit vector of [8 x float] containing the converted values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_cvtepi32_ps(__m256i __a)
+{
+ return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
+}
+
+/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \returns A 128-bit vector of [4 x float] containing the converted values.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm256_cvtpd_ps(__m256d __a)
+{
+ return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
+}
+
+/// Converts a vector of [8 x float] into a vector of [8 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \returns A 256-bit integer vector containing the converted values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_cvtps_epi32(__m256 __a)
+{
+ return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
+}
+
+/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
+/// x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 256-bit vector of [4 x double] containing the converted values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_cvtps_pd(__m128 __a)
+{
+ return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
+}
+
+/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
+/// x i32], truncating the result by rounding towards zero when it is
+/// inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_cvttpd_epi32(__m256d __a)
+{
+ return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
+}
+
+/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
+/// x i32]. When a conversion is inexact, the value returned is rounded
+/// according to the rounding control bits in the MXCSR register.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \returns A 128-bit integer vector containing the converted values.
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_cvtpd_epi32(__m256d __a)
+{
+ return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
+}
+
+/// Converts a vector of [8 x float] into a vector of [8 x i32],
+/// truncating the result by rounding towards zero when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \returns A 256-bit integer vector containing the converted values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_cvttps_epi32(__m256 __a)
+{
+ return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
+}
+
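+/* Example (illustrative, assuming the default MXCSR rounding mode of
+ * round-to-nearest): the truncating and non-truncating conversions differ for
+ * inexact inputs.
+ *
+ *   __m256d d = _mm256_set1_pd(2.7);
+ *   __m128i r = _mm256_cvtpd_epi32(d);   // each element becomes 3 (rounded)
+ *   __m128i t = _mm256_cvttpd_epi32(d);  // each element becomes 2 (truncated)
+ */
+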
+/// Returns the first element of the input vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \returns A 64-bit double containing the first element of the input vector.
+static __inline double __DEFAULT_FN_ATTRS
+_mm256_cvtsd_f64(__m256d __a)
+{
+ return __a[0];
+}
+
+/// Returns the first element of the input vector of [8 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x i32].
+/// \returns A 32-bit integer containing the first element of the input vector.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_cvtsi256_si32(__m256i __a)
+{
+ __v8si __b = (__v8si)__a;
+ return __b[0];
+}
+
+/// Returns the first element of the input vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \returns A 32-bit float containing the first element of the input vector.
+static __inline float __DEFAULT_FN_ATTRS
+_mm256_cvtss_f32(__m256 __a)
+{
+ return __a[0];
+}
+
+/* Vector replicate */
+/// Moves and duplicates odd-indexed values from a 256-bit vector of
+/// [8 x float] to float values in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
+/// the return value. \n
+/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
+/// the return value. \n
+/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
+/// return value. \n
+/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
+/// return value.
+/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
+/// values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_movehdup_ps(__m256 __a)
+{
+ return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
+}
+
+/// Moves and duplicates even-indexed values from a 256-bit vector of
+/// [8 x float] to float values in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
+/// the return value. \n
+/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
+/// the return value. \n
+/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
+/// return value. \n
+/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
+/// return value.
+/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
+/// values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_moveldup_ps(__m256 __a)
+{
+ return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
+}
+
+/// Moves and duplicates double-precision floating point values from a
+/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
+/// vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double]. \n
+/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
+/// return value. \n
+/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
+/// the return value.
+/// \returns A 256-bit vector of [4 x double] containing the moved and
+/// duplicated values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_movedup_pd(__m256d __a)
+{
+ return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
+}
+
+/* Unpack and Interleave */
+/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
+/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double]. \n
+/// Bits [127:64] are written to bits [63:0] of the return value. \n
+/// Bits [255:192] are written to bits [191:128] of the return value. \n
+/// \param __b
+/// A 256-bit floating-point vector of [4 x double]. \n
+/// Bits [127:64] are written to bits [127:64] of the return value. \n
+/// Bits [255:192] are written to bits [255:192] of the return value. \n
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_unpackhi_pd(__m256d __a, __m256d __b)
+{
+ return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
+}
+
+/// Unpacks the even-indexed vector elements from two 256-bit vectors of
+/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double]. \n
+/// Bits [63:0] are written to bits [63:0] of the return value. \n
+/// Bits [191:128] are written to bits [191:128] of the return value.
+/// \param __b
+/// A 256-bit floating-point vector of [4 x double]. \n
+/// Bits [63:0] are written to bits [127:64] of the return value. \n
+/// Bits [191:128] are written to bits [255:192] of the return value. \n
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_unpacklo_pd(__m256d __a, __m256d __b)
+{
+ return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
+}
+
+/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
+/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
+/// vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [95:64] are written to bits [31:0] of the return value. \n
+/// Bits [127:96] are written to bits [95:64] of the return value. \n
+/// Bits [223:192] are written to bits [159:128] of the return value. \n
+/// Bits [255:224] are written to bits [223:192] of the return value.
+/// \param __b
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [95:64] are written to bits [63:32] of the return value. \n
+/// Bits [127:96] are written to bits [127:96] of the return value. \n
+/// Bits [223:192] are written to bits [191:160] of the return value. \n
+/// Bits [255:224] are written to bits [255:224] of the return value.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_unpackhi_ps(__m256 __a, __m256 __b)
+{
+ return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
+}
+
+/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
+/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
+/// vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [31:0] are written to bits [31:0] of the return value. \n
+/// Bits [63:32] are written to bits [95:64] of the return value. \n
+/// Bits [159:128] are written to bits [159:128] of the return value. \n
+/// Bits [191:160] are written to bits [223:192] of the return value.
+/// \param __b
+/// A 256-bit vector of [8 x float]. \n
+/// Bits [31:0] are written to bits [63:32] of the return value. \n
+/// Bits [63:32] are written to bits [127:96] of the return value. \n
+/// Bits [159:128] are written to bits [191:160] of the return value. \n
+/// Bits [191:160] are written to bits [255:224] of the return value.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_unpacklo_ps(__m256 __a, __m256 __b)
+{
+ return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
+}
+
+/* Bit Test */
+/// Given two 128-bit floating-point vectors of [2 x double], perform an
+/// element-by-element comparison of the double-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns the ZF flag in the EFLAGS register.
+static __inline int __DEFAULT_FN_ATTRS128
+_mm_testz_pd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
+}
+
+/// Given two 128-bit floating-point vectors of [2 x double], perform an
+/// element-by-element comparison of the double-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns the CF flag in the EFLAGS register.
+static __inline int __DEFAULT_FN_ATTRS128
+_mm_testc_pd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
+}
+
+/// Given two 128-bit floating-point vectors of [2 x double], perform an
+/// element-by-element comparison of the double-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS128
+_mm_testnzc_pd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
+}
+
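+/* Example (illustrative, given two __m128d values a and b and assuming SSE2's
+ * _mm_cmplt_pd and _mm_set1_pd): because only the sign bits are examined,
+ * these intrinsics are convenient for testing comparison masks.
+ *
+ *   __m128d mask = _mm_cmplt_pd(a, b);            // all-ones where a[i] < b[i]
+ *   int none = _mm_testz_pd(mask, mask);          // 1 if no element compared true
+ *   int all  = _mm_testc_pd(mask, _mm_set1_pd(-1.0)); // 1 if all compared true
+ */
+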
+/// Given two 128-bit floating-point vectors of [4 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS128
+_mm_testz_ps(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Given two 128-bit floating-point vectors of [4 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS128
+_mm_testc_ps(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Given two 128-bit floating-point vectors of [4 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS128
+_mm_testnzc_ps(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Given two 256-bit floating-point vectors of [4 x double], perform an
+/// element-by-element comparison of the double-precision elements in the
+/// first source vector and the corresponding elements in the second source
+/// vector.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \param __b
+/// A 256-bit vector of [4 x double].
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_pd(__m256d __a, __m256d __b)
+{
+ return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// Given two 256-bit floating-point vectors of [4 x double], perform an
+/// element-by-element comparison of the double-precision elements in the
+/// first source vector and the corresponding elements in the second source
+/// vector.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \param __b
+/// A 256-bit vector of [4 x double].
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_pd(__m256d __a, __m256d __b)
+{
+ return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// Given two 256-bit floating-point vectors of [4 x double], perform an
+/// element-by-element comparison of the double-precision elements in the
+/// first source vector and the corresponding elements in the second source
+/// vector.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of double-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double].
+/// \param __b
+/// A 256-bit vector of [4 x double].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_pd(__m256d __a, __m256d __b)
+{
+ return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
+}
+
+/// Given two 256-bit floating-point vectors of [8 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \param __b
+/// A 256-bit vector of [8 x float].
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_ps(__m256 __a, __m256 __b)
+{
+ return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// Given two 256-bit floating-point vectors of [8 x float], perform an
+/// element-by-element comparison of the single-precision element in the
+/// first source vector and the corresponding element in the second source
+/// vector.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \param __b
+/// A 256-bit vector of [8 x float].
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_ps(__m256 __a, __m256 __b)
+{
+ return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// Given two 256-bit floating-point vectors of [8 x float], perform an
+/// element-by-element comparison of the single-precision elements in the
+/// first source vector and the corresponding elements in the second source
+/// vector.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
+/// ZF flag is set to 1. \n
+/// If there is at least one pair of single-precision elements where the
+/// sign-bit of the first element is 0 and the sign-bit of the second element
+/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float].
+/// \param __b
+/// A 256-bit vector of [8 x float].
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_ps(__m256 __a, __m256 __b)
+{
+ return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
+}
+
+/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
+/// of the two source vectors.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of bits where both bits are 1, the ZF flag
+/// is set to 0. Otherwise the ZF flag is set to 1. \n
+/// If there is at least one pair of bits where the bit from the first source
+/// vector is 0 and the bit from the second source vector is 1, the CF flag
+/// is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the ZF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \param __b
+/// A 256-bit integer vector.
+/// \returns the ZF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testz_si256(__m256i __a, __m256i __b)
+{
+ return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
+}
+
+/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
+/// of the two source vectors.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of bits where both bits are 1, the ZF flag
+/// is set to 0. Otherwise the ZF flag is set to 1. \n
+/// If there is at least one pair of bits where the bit from the first source
+/// vector is 0 and the bit from the second source vector is 1, the CF flag
+/// is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns the value of the CF flag.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \param __b
+/// A 256-bit integer vector.
+/// \returns the CF flag.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testc_si256(__m256i __a, __m256i __b)
+{
+ return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
+}
+
+/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
+/// of the two source vectors.
+///
+/// The EFLAGS register is updated as follows: \n
+/// If there is at least one pair of bits where both bits are 1, the ZF flag
+/// is set to 0. Otherwise the ZF flag is set to 1. \n
+/// If there is at least one pair of bits where the bit from the first source
+/// vector is 0 and the bit from the second source vector is 1, the CF flag
+/// is set to 0. Otherwise the CF flag is set to 1. \n
+/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
+/// otherwise it returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \param __b
+/// A 256-bit integer vector.
+/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_testnzc_si256(__m256i __a, __m256i __b)
+{
+ return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
+}
+
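+/* Example (illustrative, given a __m256i value v and assuming AVX and
+ * <immintrin.h>): common VPTEST idioms built from the ZF/CF behavior
+ * documented above.
+ *
+ *   int is_all_zero = _mm256_testz_si256(v, v);       // ZF: v & v has no bit set
+ *   int is_all_ones =
+ *       _mm256_testc_si256(v, _mm256_set1_epi32(-1)); // CF: ~v & all-ones == 0
+ */
+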
+/* Vector extract sign mask */
+/// Extracts the sign bits of double-precision floating point elements
+/// in a 256-bit vector of [4 x double] and writes them to the lower order
+/// bits of the return value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the double-precision
+/// floating point values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [3:0].
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_movemask_pd(__m256d __a)
+{
+ return __builtin_ia32_movmskpd256((__v4df)__a);
+}
+
+/// Extracts the sign bits of single-precision floating point elements
+/// in a 256-bit vector of [8 x float] and writes them to the lower order
+/// bits of the return value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
+///
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the single-precision floating
+/// point values with sign bits to be extracted.
+/// \returns The sign bits from the operand, written to bits [7:0].
+static __inline int __DEFAULT_FN_ATTRS
+_mm256_movemask_ps(__m256 __a)
+{
+ return __builtin_ia32_movmskps256((__v8sf)__a);
+}
+
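+/* Example (illustrative, given two __m256d values x and y and assuming AVX's
+ * _mm256_cmp_pd with the _CMP_LT_OQ predicate): the sign-bit mask collapses a
+ * comparison result into an ordinary integer.
+ *
+ *   __m256d m = _mm256_cmp_pd(x, y, _CMP_LT_OQ);  // all-ones where x[i] < y[i]
+ *   int bits  = _mm256_movemask_pd(m);            // bit i set iff x[i] < y[i]
+ *   int all   = (bits == 0xF);                    // every element compared true
+ */
+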
+/* Vector __zero */
+/// Zeroes the contents of all XMM and YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
+static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
+_mm256_zeroall(void)
+{
+ __builtin_ia32_vzeroall();
+}
+
+/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
+static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
+_mm256_zeroupper(void)
+{
+ __builtin_ia32_vzeroupper();
+}
+
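+/* Usage sketch (illustrative; legacy_sse_function is a hypothetical routine
+ * compiled without AVX): issuing VZEROUPPER before legacy SSE code avoids
+ * AVX/SSE transition penalties. Compilers normally insert this automatically
+ * around such calls, so explicit use is rarely required.
+ *
+ *   _mm256_zeroupper();
+ *   legacy_sse_function();
+ */
+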
+/* Vector load with broadcast */
+/// Loads a scalar single-precision floating point value from the
+/// specified address pointed to by \a __a and broadcasts it to the elements
+/// of a [4 x float] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
+///
+/// \param __a
+/// A pointer to the single-precision floating point value to be broadcast.
+/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
+/// equal to the broadcast value.
+static __inline __m128 __DEFAULT_FN_ATTRS128
+_mm_broadcast_ss(float const *__a)
+{
+ float __f = *__a;
+ return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
+}
+
+/// Loads a scalar double-precision floating point value from the
+/// specified address pointed to by \a __a and broadcasts it to the elements
+/// of a [4 x double] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
+///
+/// \param __a
+/// A pointer to the double-precision floating point value to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
+/// equal to the broadcast value.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_sd(double const *__a)
+{
+ double __d = *__a;
+ return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
+}
+
+/// Loads a scalar single-precision floating point value from the
+/// specified address pointed to by \a __a and broadcasts it to the elements
+/// of a [8 x float] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
+///
+/// \param __a
+/// A pointer to the single-precision floating point value to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
+/// equal to the broadcast value.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_ss(float const *__a)
+{
+ float __f = *__a;
+ return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
+/// Loads the data from a 128-bit vector of [2 x double] from the
+/// specified address pointed to by \a __a and broadcasts it to 128-bit
+/// elements in a 256-bit vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
+///
+/// \param __a
+/// A pointer to the 128-bit vector of [2 x double] to be broadcast.
+/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
+/// equal to the broadcast value.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_broadcast_pd(__m128d const *__a)
+{
+ __m128d __b = _mm_loadu_pd((const double *)__a);
+ return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
+ 0, 1, 0, 1);
+}
+
+/// Loads the data from a 128-bit vector of [4 x float] from the
+/// specified address pointed to by \a __a and broadcasts it to 128-bit
+/// elements in a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
+///
+/// \param __a
+/// A pointer to the 128-bit vector of [4 x float] to be broadcast.
+/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
+/// equal to the broadcast value.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_broadcast_ps(__m128 const *__a)
+{
+ __m128 __b = _mm_loadu_ps((const float *)__a);
+ return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
+ 0, 1, 2, 3, 0, 1, 2, 3);
+}
+
+/* SIMD load ops */
+/// Loads 4 double-precision floating point values from a 32-byte aligned
+/// memory location pointed to by \a __p into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location containing
+/// double-precision floating point values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_load_pd(double const *__p)
+{
+ return *(const __m256d *)__p;
+}
+
+/// Loads 8 single-precision floating point values from a 32-byte aligned
+/// memory location pointed to by \a __p into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location containing float values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_load_ps(float const *__p)
+{
+ return *(const __m256 *)__p;
+}
+
+/// Loads 4 double-precision floating point values from an unaligned
+/// memory location pointed to by \a __p into a vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location containing double-precision floating
+/// point values.
+/// \returns A 256-bit vector of [4 x double] containing the moved values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_loadu_pd(double const *__p)
+{
+ struct __loadu_pd {
+ __m256d_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_pd*)__p)->__v;
+}
+
+/// Loads 8 single-precision floating point values from an unaligned
+/// memory location pointed to by \a __p into a vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location containing single-precision floating
+/// point values.
+/// \returns A 256-bit vector of [8 x float] containing the moved values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_loadu_ps(float const *__p)
+{
+ struct __loadu_ps {
+ __m256_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_ps*)__p)->__v;
+}
+
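+/* Example (illustrative, assuming GCC/Clang-style alignment attributes and
+ * <immintrin.h>): _mm256_load_ps requires a 32-byte aligned address, while
+ * _mm256_loadu_ps accepts any address.
+ *
+ *   float buf[16] __attribute__((aligned(32)));
+ *   __m256 a = _mm256_load_ps(buf);       // OK: buf is 32-byte aligned
+ *   __m256 u = _mm256_loadu_ps(buf + 1);  // misaligned start: use loadu
+ */
+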
+/// Loads 256 bits of integer data from a 32-byte aligned memory
+/// location pointed to by \a __p into elements of a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
+/// values.
+/// \returns A 256-bit integer vector containing the moved values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_load_si256(__m256i const *__p)
+{
+ return *__p;
+}
+
+/// Loads 256 bits of integer data from an unaligned memory location
+/// pointed to by \a __p into a 256-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
+///
+/// \param __p
+/// A pointer to a 256-bit integer vector containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_loadu_si256(__m256i_u const *__p)
+{
+ struct __loadu_si256 {
+ __m256i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_si256*)__p)->__v;
+}
+
+/// Loads 256 bits of integer data from an unaligned memory location
+/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
+/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
+/// line boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
+///
+/// \param __p
+/// A pointer to a 256-bit integer vector containing integer values.
+/// \returns A 256-bit integer vector containing the moved values.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_lddqu_si256(__m256i const *__p)
+{
+ return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
+}
+
+/* SIMD store ops */
+/// Stores double-precision floating point values from a 256-bit vector
+/// of [4 x double] to a 32-byte aligned memory location pointed to by
+/// \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location that will receive the
+/// double-precision floating point values.
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_pd(double *__p, __m256d __a)
+{
+ *(__m256d *)__p = __a;
+}
+
+/// Stores single-precision floating point values from a 256-bit vector
+/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location that will receive the
+/// float values.
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_ps(float *__p, __m256 __a)
+{
+ *(__m256 *)__p = __a;
+}
+
+/// Stores double-precision floating point values from a 256-bit vector
+/// of [4 x double] to an unaligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the double-precision
+/// floating point values.
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_pd(double *__p, __m256d __a)
+{
+ struct __storeu_pd {
+ __m256d_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_pd*)__p)->__v = __a;
+}
+
+/// Stores single-precision floating point values from a 256-bit vector
+/// of [8 x float] to an unaligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the float values.
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_ps(float *__p, __m256 __a)
+{
+ struct __storeu_ps {
+ __m256_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_ps*)__p)->__v = __a;
+}
+
+/// Stores integer values from a 256-bit integer vector to a 32-byte
+/// aligned memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
+///
+/// \param __p
+/// A 32-byte aligned pointer to a memory location that will receive the
+/// integer values.
+/// \param __a
+/// A 256-bit integer vector containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_store_si256(__m256i *__p, __m256i __a)
+{
+ *__p = __a;
+}
+
+/// Stores integer values from a 256-bit integer vector to an unaligned
+/// memory location pointed to by \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the integer values.
+/// \param __a
+/// A 256-bit integer vector containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
+{
+ struct __storeu_si256 {
+ __m256i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_si256*)__p)->__v = __a;
+}
+
+/* Conditional load ops */
+/// Conditionally loads double-precision floating point elements from a
+/// memory location pointed to by \a __p into a 128-bit vector of
+/// [2 x double], depending on the mask bits associated with each data
+/// element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that contains the double-precision
+/// floating point values.
+/// \param __m
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each data element represents the mask bits. If a mask bit is zero, the
+/// corresponding value in the memory location is not loaded and the
+/// corresponding field in the return value is set to zero.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
+static __inline __m128d __DEFAULT_FN_ATTRS128
+_mm_maskload_pd(double const *__p, __m128i __m)
+{
+ return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
+}
+
+/// Conditionally loads double-precision floating point elements from a
+/// memory location pointed to by \a __p into a 256-bit vector of
+/// [4 x double], depending on the mask bits associated with each data
+/// element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that contains the double-precision
+/// floating point values.
+/// \param __m
+/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
+/// significant bit of each quadword element represents the mask bits. If a
+/// mask bit is zero, the corresponding value in the memory location is not
+/// loaded and the corresponding field in the return value is set to zero.
+/// \returns A 256-bit vector of [4 x double] containing the loaded values.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_maskload_pd(double const *__p, __m256i __m)
+{
+ return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
+ (__v4di)__m);
+}
+
+/// Conditionally loads single-precision floating point elements from a
+/// memory location pointed to by \a __p into a 128-bit vector of
+/// [4 x float], depending on the mask bits associated with each data
+/// element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that contains the single-precision
+/// floating point values.
+/// \param __m
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each data element represents the mask bits. If a mask bit is zero, the
+/// corresponding value in the memory location is not loaded and the
+/// corresponding field in the return value is set to zero.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
+static __inline __m128 __DEFAULT_FN_ATTRS128
+_mm_maskload_ps(float const *__p, __m128i __m)
+{
+ return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
+}
+
+/// Conditionally loads single-precision floating point elements from a
+/// memory location pointed to by \a __p into a 256-bit vector of
+/// [8 x float], depending on the mask bits associated with each data
+/// element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that contains the single-precision
+/// floating point values.
+/// \param __m
+/// A 256-bit integer vector of [8 x dword] containing the mask. The most
+/// significant bit of each dword element represents the mask bits. If a mask
+/// bit is zero, the corresponding value in the memory location is not loaded
+/// and the corresponding field in the return value is set to zero.
+/// \returns A 256-bit vector of [8 x float] containing the loaded values.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_maskload_ps(float const *__p, __m256i __m)
+{
+ return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
+}
+
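+/* Example (illustrative, assuming AVX and <immintrin.h>): loading only the
+ * first three floats of a short buffer. Lanes whose mask high bit is clear are
+ * not read from memory and are returned as zero.
+ *
+ *   const float src[3] = { 1.0f, 2.0f, 3.0f };
+ *   __m256i mask = _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0);
+ *   __m256 v = _mm256_maskload_ps(src, mask);  // lanes 3..7 are zero
+ */
+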
+/* Conditional store ops */
+/// Moves single-precision floating point values from a 256-bit vector
+/// of [8 x float] to a memory location pointed to by \a __p, according to
+/// the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the float values.
+/// \param __m
+/// A 256-bit integer vector of [8 x dword] containing the mask. The most
+/// significant bit of each dword element in the mask vector represents the
+/// mask bits. If a mask bit is zero, the corresponding value from vector
+/// \a __a is not stored and the corresponding field in the memory location
+/// pointed to by \a __p is not changed.
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
+{
+ __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
+}
+
+/// Moves double-precision values from a 128-bit vector of [2 x double]
+/// to a memory location pointed to by \a __p, according to the specified
+/// mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the double-precision
+/// floating point values.
+/// \param __m
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each field in the mask vector represents the mask bits. If a mask bit is
+/// zero, the corresponding value from vector \a __a is not stored and the
+/// corresponding field in the memory location pointed to by \a __p is not
+/// changed.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS128
+_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
+{
+ __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
+}
+
+/// Moves double-precision values from a 256-bit vector of [4 x double]
+/// to a memory location pointed to by \a __p, according to the specified
+/// mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the double-precision
+/// floating point values.
+/// \param __m
+/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
+/// significant bit of each quadword element in the mask vector represents
+/// the mask bits. If a mask bit is zero, the corresponding value from vector
+/// \a __a is not stored and the corresponding field in the memory location
+/// pointed to by \a __p is not changed.
+/// \param __a
+/// A 256-bit vector of [4 x double] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
+{
+ __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
+}
+
+/// Moves single-precision floating point values from a 128-bit vector
+/// of [4 x float] to a memory location pointed to by \a __p, according to
+/// the specified mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the float values.
+/// \param __m
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each field in the mask vector represents the mask bits. If a mask bit is
+/// zero, the corresponding value from vector \a __a is not stored and the
+/// corresponding field in the memory location pointed to by \a __p is not
+/// changed.
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline void __DEFAULT_FN_ATTRS128
+_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
+{
+ __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
+}
+
+/* Cacheability support ops */
+/// Moves integer data from a 256-bit integer vector to a 32-byte
+/// aligned memory location. To minimize caching, the data is flagged as
+/// non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
+///
+/// \param __a
+/// A pointer to a 32-byte aligned memory location that will receive the
+/// integer values.
+/// \param __b
+/// A 256-bit integer vector containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_si256(__m256i *__a, __m256i __b)
+{
+ typedef __v4di __v4di_aligned __attribute__((aligned(32)));
+ __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
+}
+
+/// Moves double-precision values from a 256-bit vector of [4 x double]
+/// to a 32-byte aligned memory location. To minimize caching, the data is
+/// flagged as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
+///
+/// \param __a
+/// A pointer to a 32-byte aligned memory location that will receive the
+/// double-precision floating-point values.
+/// \param __b
+/// A 256-bit vector of [4 x double] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_pd(double *__a, __m256d __b)
+{
+ typedef __v4df __v4df_aligned __attribute__((aligned(32)));
+ __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
+}
+
+/// Moves single-precision floating point values from a 256-bit vector
+/// of [8 x float] to a 32-byte aligned memory location. To minimize
+/// caching, the data is flagged as non-temporal (unlikely to be used again
+/// soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 32-byte aligned memory location that will receive the
+/// single-precision floating point values.
+/// \param __a
+/// A 256-bit vector of [8 x float] containing the values to be moved.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_stream_ps(float *__p, __m256 __a)
+{
+ typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
+ __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
+}
+
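+/* Example (illustrative, given a __m256 value v and assuming SSE's _mm_sfence
+ * and <immintrin.h>): non-temporal stores need a 32-byte aligned destination
+ * and are typically followed by a store fence before the data is consumed by
+ * another agent.
+ *
+ *   float dst[8] __attribute__((aligned(32)));
+ *   _mm256_stream_ps(dst, v);   // bypasses the cache where possible
+ *   _mm_sfence();               // order the streaming store before later use
+ */
+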
+/* Create vectors */
+/// Create a 256-bit vector of [4 x double] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit vector of [4 x double] containing undefined values.
+static __inline__ __m256d __DEFAULT_FN_ATTRS
+_mm256_undefined_pd(void)
+{
+ return (__m256d)__builtin_ia32_undef256();
+}
+
+/// Create a 256-bit vector of [8 x float] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit vector of [8 x float] containing undefined values.
+static __inline__ __m256 __DEFAULT_FN_ATTRS
+_mm256_undefined_ps(void)
+{
+ return (__m256)__builtin_ia32_undef256();
+}
+
+/// Create a 256-bit integer vector with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 256-bit integer vector containing undefined values.
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm256_undefined_si256(void)
+{
+ return (__m256i)__builtin_ia32_undef256();
+}
+
+/// Constructs a 256-bit floating-point vector of [4 x double]
+/// initialized with the specified double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __a
+/// A double-precision floating-point value used to initialize bits [255:192]
+/// of the result.
+/// \param __b
+/// A double-precision floating-point value used to initialize bits [191:128]
+/// of the result.
+/// \param __c
+/// A double-precision floating-point value used to initialize bits [127:64]
+/// of the result.
+/// \param __d
+/// A double-precision floating-point value used to initialize bits [63:0]
+/// of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set_pd(double __a, double __b, double __c, double __d)
+{
+ return __extension__ (__m256d){ __d, __c, __b, __a };
+}
+
+/// Constructs a 256-bit floating-point vector of [8 x float] initialized
+/// with the specified single-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __a
+/// A single-precision floating-point value used to initialize bits [255:224]
+/// of the result.
+/// \param __b
+/// A single-precision floating-point value used to initialize bits [223:192]
+/// of the result.
+/// \param __c
+/// A single-precision floating-point value used to initialize bits [191:160]
+/// of the result.
+/// \param __d
+/// A single-precision floating-point value used to initialize bits [159:128]
+/// of the result.
+/// \param __e
+/// A single-precision floating-point value used to initialize bits [127:96]
+/// of the result.
+/// \param __f
+/// A single-precision floating-point value used to initialize bits [95:64]
+/// of the result.
+/// \param __g
+/// A single-precision floating-point value used to initialize bits [63:32]
+/// of the result.
+/// \param __h
+/// A single-precision floating-point value used to initialize bits [31:0]
+/// of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set_ps(float __a, float __b, float __c, float __d,
+ float __e, float __f, float __g, float __h)
+{
+ return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
+}
+
+/// Constructs a 256-bit integer vector initialized with the specified
+/// 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i0
+/// A 32-bit integral value used to initialize bits [255:224] of the result.
+/// \param __i1
+/// A 32-bit integral value used to initialize bits [223:192] of the result.
+/// \param __i2
+/// A 32-bit integral value used to initialize bits [191:160] of the result.
+/// \param __i3
+/// A 32-bit integral value used to initialize bits [159:128] of the result.
+/// \param __i4
+/// A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \param __i5
+/// A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i6
+/// A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i7
+/// A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
+ int __i4, int __i5, int __i6, int __i7)
+{
+ return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
+}
+
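+/* Example (illustrative, assuming AVX and <immintrin.h>): _mm256_set_epi32
+ * takes its arguments from the highest element down to the lowest, so the
+ * first argument lands in bits [255:224]. The reversed-order helper produces
+ * the same vector:
+ *
+ *   __m256i a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ *   __m256i b = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);  // a equals b
+ */
+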
+/// Constructs a 256-bit integer vector initialized with the specified
+/// 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w15
+/// A 16-bit integral value used to initialize bits [255:240] of the result.
+/// \param __w14
+/// A 16-bit integral value used to initialize bits [239:224] of the result.
+/// \param __w13
+/// A 16-bit integral value used to initialize bits [223:208] of the result.
+/// \param __w12
+/// A 16-bit integral value used to initialize bits [207:192] of the result.
+/// \param __w11
+/// A 16-bit integral value used to initialize bits [191:176] of the result.
+/// \param __w10
+/// A 16-bit integral value used to initialize bits [175:160] of the result.
+/// \param __w09
+/// A 16-bit integral value used to initialize bits [159:144] of the result.
+/// \param __w08
+/// A 16-bit integral value used to initialize bits [143:128] of the result.
+/// \param __w07
+/// A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \param __w06
+/// A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w05
+/// A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w04
+/// A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w03
+/// A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w02
+/// A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w01
+/// A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w00
+/// A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
+ short __w11, short __w10, short __w09, short __w08,
+ short __w07, short __w06, short __w05, short __w04,
+ short __w03, short __w02, short __w01, short __w00)
+{
+ return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
+ __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
+}
+
+/// Constructs a 256-bit integer vector initialized with the specified
+/// 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b31
+/// An 8-bit integral value used to initialize bits [255:248] of the result.
+/// \param __b30
+/// An 8-bit integral value used to initialize bits [247:240] of the result.
+/// \param __b29
+/// An 8-bit integral value used to initialize bits [239:232] of the result.
+/// \param __b28
+/// An 8-bit integral value used to initialize bits [231:224] of the result.
+/// \param __b27
+/// An 8-bit integral value used to initialize bits [223:216] of the result.
+/// \param __b26
+/// An 8-bit integral value used to initialize bits [215:208] of the result.
+/// \param __b25
+/// An 8-bit integral value used to initialize bits [207:200] of the result.
+/// \param __b24
+/// An 8-bit integral value used to initialize bits [199:192] of the result.
+/// \param __b23
+/// An 8-bit integral value used to initialize bits [191:184] of the result.
+/// \param __b22
+/// An 8-bit integral value used to initialize bits [183:176] of the result.
+/// \param __b21
+/// An 8-bit integral value used to initialize bits [175:168] of the result.
+/// \param __b20
+/// An 8-bit integral value used to initialize bits [167:160] of the result.
+/// \param __b19
+/// An 8-bit integral value used to initialize bits [159:152] of the result.
+/// \param __b18
+/// An 8-bit integral value used to initialize bits [151:144] of the result.
+/// \param __b17
+/// An 8-bit integral value used to initialize bits [143:136] of the result.
+/// \param __b16
+/// An 8-bit integral value used to initialize bits [135:128] of the result.
+/// \param __b15
+/// An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \param __b14
+/// An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b13
+/// An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b12
+/// An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b11
+/// An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b10
+/// An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b09
+/// An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b08
+/// An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b07
+/// An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b06
+/// An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b05
+/// An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b04
+/// An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b03
+/// An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b02
+/// An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b01
+/// An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b00
+/// An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
+ char __b27, char __b26, char __b25, char __b24,
+ char __b23, char __b22, char __b21, char __b20,
+ char __b19, char __b18, char __b17, char __b16,
+ char __b15, char __b14, char __b13, char __b12,
+ char __b11, char __b10, char __b09, char __b08,
+ char __b07, char __b06, char __b05, char __b04,
+ char __b03, char __b02, char __b01, char __b00)
+{
+ return __extension__ (__m256i)(__v32qi){
+ __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
+ __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
+ __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
+ __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
+ };
+}
+
+/// Constructs a 256-bit integer vector initialized with the specified
+/// 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __a
+/// A 64-bit integral value used to initialize bits [255:192] of the result.
+/// \param __b
+/// A 64-bit integral value used to initialize bits [191:128] of the result.
+/// \param __c
+/// A 64-bit integral value used to initialize bits [127:64] of the result.
+/// \param __d
+/// A 64-bit integral value used to initialize bits [63:0] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+ return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
+}
+
+/* Create vectors with elements in reverse order */
+/// Constructs a 256-bit floating-point vector of [4 x double],
+/// initialized in reverse order with the specified double-precision
+/// floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __a
+/// A double-precision floating-point value used to initialize bits [63:0]
+/// of the result.
+/// \param __b
+/// A double-precision floating-point value used to initialize bits [127:64]
+/// of the result.
+/// \param __c
+/// A double-precision floating-point value used to initialize bits [191:128]
+/// of the result.
+/// \param __d
+/// A double-precision floating-point value used to initialize bits [255:192]
+/// of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setr_pd(double __a, double __b, double __c, double __d)
+{
+ return _mm256_set_pd(__d, __c, __b, __a);
+}
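+
+/* Editor's note: an illustrative comparison, not part of the upstream header.
+ * The "setr" forms list elements from bits [63:0] upward, so these two calls
+ * build the same vector:
+ *
+ *   __m256d a = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);
+ *   __m256d b = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
+ */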
+
+/// Constructs a 256-bit floating-point vector of [8 x float],
+/// initialized in reverse order with the specified single-precision
+/// floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __a
+/// A single-precision floating-point value used to initialize bits [31:0]
+/// of the result.
+/// \param __b
+/// A single-precision floating-point value used to initialize bits [63:32]
+/// of the result.
+/// \param __c
+/// A single-precision floating-point value used to initialize bits [95:64]
+/// of the result.
+/// \param __d
+/// A single-precision floating-point value used to initialize bits [127:96]
+/// of the result.
+/// \param __e
+/// A single-precision floating-point value used to initialize bits [159:128]
+/// of the result.
+/// \param __f
+/// A single-precision floating-point value used to initialize bits [191:160]
+/// of the result.
+/// \param __g
+/// A single-precision floating-point value used to initialize bits [223:192]
+/// of the result.
+/// \param __h
+/// A single-precision floating-point value used to initialize bits [255:224]
+/// of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setr_ps(float __a, float __b, float __c, float __d,
+ float __e, float __f, float __g, float __h)
+{
+ return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
+}
+
+/// Constructs a 256-bit integer vector, initialized in reverse order
+/// with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i0
+/// A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+/// A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+/// A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+/// A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \param __i4
+/// A 32-bit integral value used to initialize bits [159:128] of the result.
+/// \param __i5
+/// A 32-bit integral value used to initialize bits [191:160] of the result.
+/// \param __i6
+/// A 32-bit integral value used to initialize bits [223:192] of the result.
+/// \param __i7
+/// A 32-bit integral value used to initialize bits [255:224] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
+ int __i4, int __i5, int __i6, int __i7)
+{
+ return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
+}
+
+/// Constructs a 256-bit integer vector, initialized in reverse order
+/// with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w15
+/// A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w14
+/// A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w13
+/// A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w12
+/// A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w11
+/// A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w10
+/// A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w09
+/// A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w08
+/// A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \param __w07
+/// A 16-bit integral value used to initialize bits [143:128] of the result.
+/// \param __w06
+/// A 16-bit integral value used to initialize bits [159:144] of the result.
+/// \param __w05
+/// A 16-bit integral value used to initialize bits [175:160] of the result.
+/// \param __w04
+/// A 16-bit integral value used to initialize bits [191:176] of the result.
+/// \param __w03
+/// A 16-bit integral value used to initialize bits [207:192] of the result.
+/// \param __w02
+/// A 16-bit integral value used to initialize bits [223:208] of the result.
+/// \param __w01
+/// A 16-bit integral value used to initialize bits [239:224] of the result.
+/// \param __w00
+/// A 16-bit integral value used to initialize bits [255:240] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
+ short __w11, short __w10, short __w09, short __w08,
+ short __w07, short __w06, short __w05, short __w04,
+ short __w03, short __w02, short __w01, short __w00)
+{
+ return _mm256_set_epi16(__w00, __w01, __w02, __w03,
+ __w04, __w05, __w06, __w07,
+ __w08, __w09, __w10, __w11,
+ __w12, __w13, __w14, __w15);
+}
+
+/// Constructs a 256-bit integer vector, initialized in reverse order
+/// with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b31
+/// An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b30
+/// An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b29
+/// An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b28
+/// An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b27
+/// An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b26
+/// An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b25
+/// An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b24
+/// An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b23
+/// An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b22
+/// An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b21
+/// An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b20
+/// An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b19
+/// An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b18
+/// An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b17
+/// An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b16
+/// An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \param __b15
+/// An 8-bit integral value used to initialize bits [135:128] of the result.
+/// \param __b14
+/// An 8-bit integral value used to initialize bits [143:136] of the result.
+/// \param __b13
+/// An 8-bit integral value used to initialize bits [151:144] of the result.
+/// \param __b12
+/// An 8-bit integral value used to initialize bits [159:152] of the result.
+/// \param __b11
+/// An 8-bit integral value used to initialize bits [167:160] of the result.
+/// \param __b10
+/// An 8-bit integral value used to initialize bits [175:168] of the result.
+/// \param __b09
+/// An 8-bit integral value used to initialize bits [183:176] of the result.
+/// \param __b08
+/// An 8-bit integral value used to initialize bits [191:184] of the result.
+/// \param __b07
+/// An 8-bit integral value used to initialize bits [199:192] of the result.
+/// \param __b06
+/// An 8-bit integral value used to initialize bits [207:200] of the result.
+/// \param __b05
+/// An 8-bit integral value used to initialize bits [215:208] of the result.
+/// \param __b04
+/// An 8-bit integral value used to initialize bits [223:216] of the result.
+/// \param __b03
+/// An 8-bit integral value used to initialize bits [231:224] of the result.
+/// \param __b02
+/// An 8-bit integral value used to initialize bits [239:232] of the result.
+/// \param __b01
+/// An 8-bit integral value used to initialize bits [247:240] of the result.
+/// \param __b00
+/// An 8-bit integral value used to initialize bits [255:248] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
+ char __b27, char __b26, char __b25, char __b24,
+ char __b23, char __b22, char __b21, char __b20,
+ char __b19, char __b18, char __b17, char __b16,
+ char __b15, char __b14, char __b13, char __b12,
+ char __b11, char __b10, char __b09, char __b08,
+ char __b07, char __b06, char __b05, char __b04,
+ char __b03, char __b02, char __b01, char __b00)
+{
+ return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
+ __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
+ __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
+ __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
+}
+
+/// Constructs a 256-bit integer vector, initialized in reverse order
+/// with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __a
+/// A 64-bit integral value used to initialize bits [63:0] of the result.
+/// \param __b
+/// A 64-bit integral value used to initialize bits [127:64] of the result.
+/// \param __c
+/// A 64-bit integral value used to initialize bits [191:128] of the result.
+/// \param __d
+/// A 64-bit integral value used to initialize bits [255:192] of the result.
+/// \returns An initialized 256-bit integer vector.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
+{
+ return _mm256_set_epi64x(__d, __c, __b, __a);
+}
+
+/* Create vectors with repeated elements */
+/// Constructs a 256-bit floating-point vector of [4 x double], with each
+/// of the four double-precision floating-point vector elements set to the
+/// specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize each vector
+/// element of the result.
+/// \returns An initialized 256-bit floating-point vector of [4 x double].
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set1_pd(double __w)
+{
+ return _mm256_set_pd(__w, __w, __w, __w);
+}
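+
+/* Editor's note: a small usage sketch, not part of the upstream header.
+ * _mm256_set1_pd() broadcasts one scalar into every element, which is the
+ * usual way to prepare a constant operand (src is an assumed double[4]):
+ *
+ *   __m256d x     = _mm256_loadu_pd(src);
+ *   __m256d scale = _mm256_set1_pd(0.5);
+ *   __m256d half  = _mm256_mul_pd(x, scale);   // every element times 0.5
+ */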
+
+/// Constructs a 256-bit floating-point vector of [8 x float], with each
+/// of the eight single-precision floating-point vector elements set to the
+/// specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __w
+/// A single-precision floating-point value used to initialize each vector
+/// element of the result.
+/// \returns An initialized 256-bit floating-point vector of [8 x float].
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set1_ps(float __w)
+{
+ return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
+}
+
+/// Constructs a 256-bit integer vector of [8 x i32], with each of the
+/// 32-bit integral vector elements set to the specified 32-bit integral
+/// value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
+/// instruction.
+///
+/// \param __i
+/// A 32-bit integral value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 256-bit integer vector of [8 x i32].
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi32(int __i)
+{
+ return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
+}
+
+/// Constructs a 256-bit integer vector of [16 x i16], with each of the
+/// 16-bit integral vector elements set to the specified 16-bit integral
+/// value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
+///
+/// \param __w
+/// A 16-bit integral value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 256-bit integer vector of [16 x i16].
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi16(short __w)
+{
+ return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
+ __w, __w, __w, __w, __w, __w, __w, __w);
+}
+
+/// Constructs a 256-bit integer vector of [32 x i8], with each of the
+/// 8-bit integral vector elements set to the specified 8-bit integral value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
+///
+/// \param __b
+/// An 8-bit integral value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 256-bit integer vector of [32 x i8].
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi8(char __b)
+{
+ return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
+ __b, __b, __b, __b, __b, __b, __b, __b,
+ __b, __b, __b, __b, __b, __b, __b, __b,
+ __b, __b, __b, __b, __b, __b, __b, __b);
+}
+
+/// Constructs a 256-bit integer vector of [4 x i64], with each of the
+/// 64-bit integral vector elements set to the specified 64-bit integral
+/// value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
+///
+/// \param __q
+/// A 64-bit integral value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 256-bit integer vector of [4 x i64].
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set1_epi64x(long long __q)
+{
+ return _mm256_set_epi64x(__q, __q, __q, __q);
+}
+
+/* Create zeroed vectors */
+/// Constructs a 256-bit floating-point vector of [4 x double] with all
+/// vector elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setzero_pd(void)
+{
+ return __extension__ (__m256d){ 0, 0, 0, 0 };
+}
+
+/// Constructs a 256-bit floating-point vector of [8 x float] with all
+/// vector elements initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setzero_ps(void)
+{
+ return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+/// Constructs a 256-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
+///
+/// \returns A 256-bit integer vector initialized to zero.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setzero_si256(void)
+{
+ return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
+}
+
+/* Cast between vector types */
+/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
+/// floating-point vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
+/// \returns A 256-bit floating-point vector of [8 x float] containing the same
+/// bitwise pattern as the parameter.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castpd_ps(__m256d __a)
+{
+ return (__m256)__a;
+}
+
+/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
+/// \returns A 256-bit integer vector containing the same bitwise pattern as the
+/// parameter.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castpd_si256(__m256d __a)
+{
+ return (__m256i)__a;
+}
+
+/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
+/// floating-point vector of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
+/// \returns A 256-bit floating-point vector of [4 x double] containing the same
+/// bitwise pattern as the parameter.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castps_pd(__m256 __a)
+{
+ return (__m256d)__a;
+}
+
+/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
+/// \returns A 256-bit integer vector containing the same bitwise pattern as the
+/// parameter.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castps_si256(__m256 __a)
+{
+ return (__m256i)__a;
+}
+
+/// Casts a 256-bit integer vector into a 256-bit floating-point vector
+/// of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the same
+/// bitwise pattern as the parameter.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castsi256_ps(__m256i __a)
+{
+ return (__m256)__a;
+}
+
+/// Casts a 256-bit integer vector into a 256-bit floating-point vector
+/// of [4 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the same
+/// bitwise pattern as the parameter.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castsi256_pd(__m256i __a)
+{
+ return (__m256d)__a;
+}
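+
+/* Editor's note: an illustrative sketch, not part of the upstream header.
+ * The cast intrinsics reinterpret bits without converting values; for example,
+ * clearing the sign bit of each double via an integer constant (v is an
+ * assumed __m256d):
+ *
+ *   __m256d mask = _mm256_castsi256_pd(
+ *                      _mm256_set1_epi64x(0x7fffffffffffffffLL));
+ *   __m256d absv = _mm256_and_pd(v, mask);   // |v| element-wise
+ */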
+
+/// Returns the lower 128 bits of a 256-bit floating-point vector of
+/// [4 x double] as a 128-bit floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the
+/// lower 128 bits of the parameter.
+static __inline __m128d __DEFAULT_FN_ATTRS
+_mm256_castpd256_pd128(__m256d __a)
+{
+ return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
+}
+
+/// Returns the lower 128 bits of a 256-bit floating-point vector of
+/// [8 x float] as a 128-bit floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the
+/// lower 128 bits of the parameter.
+static __inline __m128 __DEFAULT_FN_ATTRS
+_mm256_castps256_ps128(__m256 __a)
+{
+ return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
+}
+
+/// Truncates a 256-bit integer vector into a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 256-bit integer vector.
+/// \returns A 128-bit integer vector containing the lower 128 bits of the
+/// parameter.
+static __inline __m128i __DEFAULT_FN_ATTRS
+_mm256_castsi256_si128(__m256i __a)
+{
+ return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
+}
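+
+/* Editor's note: a usage sketch, not part of the upstream header. The 256->128
+ * casts simply return the lower lane; the upper lane is obtained with the
+ * extract intrinsics further below (v256 is an assumed __m256i):
+ *
+ *   __m128i lo = _mm256_castsi256_si128(v256);   // bits [127:0]
+ */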
+
+/// Constructs a 256-bit floating-point vector of [4 x double] from a
+/// 128-bit floating-point vector of [2 x double].
+///
+/// The lower 128 bits contain the value of the source vector. The contents
+/// of the upper 128 bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
+/// contain the value of the parameter. The contents of the upper 128 bits
+/// are undefined.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_castpd128_pd256(__m128d __a)
+{
+ return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
+}
+
+/// Constructs a 256-bit floating-point vector of [8 x float] from a
+/// 128-bit floating-point vector of [4 x float].
+///
+/// The lower 128 bits contain the value of the source vector. The contents
+/// of the upper 128 bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
+/// contain the value of the parameter. The contents of the upper 128 bits
+/// are undefined.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_castps128_ps256(__m128 __a)
+{
+ return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
+}
+
+/// Constructs a 256-bit integer vector from a 128-bit integer vector.
+///
+/// The lower 128 bits contain the value of the source vector. The contents
+/// of the upper 128 bits are undefined.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
+/// the parameter. The contents of the upper 128 bits are undefined.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_castsi128_si256(__m128i __a)
+{
+ return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
+}
+
+/// Constructs a 256-bit floating-point vector of [4 x double] from a
+/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
+/// contain the value of the source vector. The upper 128 bits are set
+/// to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
+/// contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_zextpd128_pd256(__m128d __a)
+{
+ return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
+}
+
+/// Constructs a 256-bit floating-point vector of [8 x float] from a
+/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
+/// the value of the source vector. The upper 128 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
+/// contain the value of the parameter. The upper 128 bits are set to zero.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_zextps128_ps256(__m128 __a)
+{
+ return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+/// Constructs a 256-bit integer vector from a 128-bit integer vector.
+/// The lower 128 bits contain the value of the source vector. The upper
+/// 128 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
+/// the parameter. The upper 128 bits are set to zero.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_zextsi128_si256(__m128i __a)
+{
+ return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
+}
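+
+/* Editor's note: an illustrative sketch, not part of the upstream header.
+ * When widening 128 -> 256 bits, use the zext form if the upper lane must be
+ * well defined; the plain cast leaves it unspecified (x128 is an assumed
+ * __m128i):
+ *
+ *   __m256i a = _mm256_castsi128_si256(x128);   // bits [255:128] undefined
+ *   __m256i b = _mm256_zextsi128_si256(x128);   // bits [255:128] zeroed
+ */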
+
+/*
+ Vector insert.
+ We use macros rather than inlines because we only want to accept
+ invocations where the immediate M is a constant expression.
+*/
+/// Constructs a new 256-bit vector of [8 x float] by first duplicating
+/// a 256-bit vector of [8 x float] given in the first parameter, and then
+/// replacing either the upper or the lower 128 bits with the contents of a
+/// 128-bit vector of [4 x float] in the second parameter.
+///
+/// The immediate integer parameter determines whether the upper or the lower
+/// 128 bits are replaced.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+/// A 256-bit vector of [8 x float]. This vector is copied to the result
+/// first, and then either the upper or the lower 128 bits of the result will
+/// be replaced by the contents of \a V2.
+/// \param V2
+/// A 128-bit vector of [4 x float]. The contents of this parameter are
+/// written to either the upper or the lower 128 bits of the result depending
+/// on the value of parameter \a M.
+/// \param M
+/// An immediate integer. The least significant bit determines how the values
+/// from the two parameters are interleaved: \n
+/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
+/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
+/// result. \n
+/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
+/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+/// result.
+/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
+#define _mm256_insertf128_ps(V1, V2, M) \
+ ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
+ (__v4sf)(__m128)(V2), (int)(M)))
+
+/// Constructs a new 256-bit vector of [4 x double] by first duplicating
+/// a 256-bit vector of [4 x double] given in the first parameter, and then
+/// replacing either the upper or the lower 128 bits with the contents of a
+/// 128-bit vector of [2 x double] in the second parameter.
+///
+/// The immediate integer parameter determines whether the upper or the lower
+/// 128 bits are replaced.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+/// A 256-bit vector of [4 x double]. This vector is copied to the result
+/// first, and then either the upper or the lower 128 bits of the result will
+/// be replaced by the contents of \a V2.
+/// \param V2
+/// A 128-bit vector of [2 x double]. The contents of this parameter are
+/// written to either the upper or the lower 128 bits of the result depending
+/// on the value of parameter \a M.
+/// \param M
+/// An immediate integer. The least significant bit determines how the values
+/// from the two parameters are interleaved: \n
+/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
+/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
+/// result. \n
+/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
+/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+/// result.
+/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
+#define _mm256_insertf128_pd(V1, V2, M) \
+ ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
+ (__v2df)(__m128d)(V2), (int)(M)))
+
+/// Constructs a new 256-bit integer vector by first duplicating a
+/// 256-bit integer vector given in the first parameter, and then replacing
+/// either the upper or the lower 128 bits with the contents of a 128-bit
+/// integer vector in the second parameter.
+///
+/// The immediate integer parameter determines whether the upper or the lower
+/// 128 bits are replaced.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param V1
+/// A 256-bit integer vector. This vector is copied to the result first, and
+/// then either the upper or the lower 128 bits of the result will be
+/// replaced by the contents of \a V2.
+/// \param V2
+/// A 128-bit integer vector. The contents of this parameter are written to
+/// either the upper or the lower 128 bits of the result depending on the
+/// value of parameter \a M.
+/// \param M
+/// An immediate integer. The least significant bit determines how the values
+/// from the two parameters are interleaved: \n
+/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
+/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
+/// result. \n
+/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
+/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
+/// result.
+/// \returns A 256-bit integer vector containing the interleaved values.
+#define _mm256_insertf128_si256(V1, V2, M) \
+ ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
+ (__v4si)(__m128i)(V2), (int)(M)))
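+
+/* Editor's note: a usage sketch, not part of the upstream header. The third
+ * argument must be a constant expression; 1 selects the upper lane (lo128 and
+ * hi128 are assumed __m128i values):
+ *
+ *   __m256i v = _mm256_castsi128_si256(lo128);   // bits [127:0] from lo128
+ *   v = _mm256_insertf128_si256(v, hi128, 1);    // bits [255:128] from hi128
+ */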
+
+/*
+ Vector extract.
+ We use macros rather than inlines because we only want to accept
+ invocations where the immediate M is a constant expression.
+*/
+/// Extracts either the upper or the lower 128 bits from a 256-bit vector
+/// of [8 x float], as determined by the immediate integer parameter, and
+/// returns the extracted bits as a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+/// A 256-bit vector of [8 x float].
+/// \param M
+/// An immediate integer. The least significant bit determines which bits are
+/// extracted from the first parameter: \n
+/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+/// result. \n
+/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
+#define _mm256_extractf128_ps(V, M) \
+ ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
+
+/// Extracts either the upper or the lower 128 bits from a 256-bit vector
+/// of [4 x double], as determined by the immediate integer parameter, and
+/// returns the extracted bits as a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+/// A 256-bit vector of [4 x double].
+/// \param M
+/// An immediate integer. The least significant bit determines which bits are
+/// extracted from the first parameter: \n
+/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+/// result. \n
+/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
+#define _mm256_extractf128_pd(V, M) \
+ ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
+
+/// Extracts either the upper or the lower 128 bits from a 256-bit
+/// integer vector, as determined by the immediate integer parameter, and
+/// returns the extracted bits as a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
+///
+/// \param V
+/// A 256-bit integer vector.
+/// \param M
+/// An immediate integer. The least significant bit determines which bits are
+/// extracted from the first parameter: \n
+/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
+/// result. \n
+/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
+/// \returns A 128-bit integer vector containing the extracted bits.
+#define _mm256_extractf128_si256(V, M) \
+ ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
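+
+/* Editor's note: a usage sketch, not part of the upstream header. The
+ * immediate must be a constant; 0 and 1 select the lower and upper lanes
+ * (v256 is an assumed __m256i):
+ *
+ *   __m128i lo = _mm256_castsi256_si128(v256);      // bits [127:0]
+ *   __m128i hi = _mm256_extractf128_si256(v256, 1); // bits [255:128]
+ */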
+
+/// Constructs a 256-bit floating-point vector of [8 x float] by
+/// concatenating two 128-bit floating-point vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
+/// 128 bits of the result.
+/// \param __lo
+/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
+/// 128 bits of the result.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+/// concatenated result.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_set_m128 (__m128 __hi, __m128 __lo)
+{
+ return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+/// Constructs a 256-bit floating-point vector of [4 x double] by
+/// concatenating two 128-bit floating-point vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
+/// 128 bits of the result.
+/// \param __lo
+/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
+/// 128 bits of the result.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+/// concatenated result.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_set_m128d (__m128d __hi, __m128d __lo)
+{
+ return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
+}
+
+/// Constructs a 256-bit integer vector by concatenating two 128-bit
+/// integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __hi
+/// A 128-bit integer vector to be copied to the upper 128 bits of the
+/// result.
+/// \param __lo
+/// A 128-bit integer vector to be copied to the lower 128 bits of the
+/// result.
+/// \returns A 256-bit integer vector containing the concatenated result.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_set_m128i (__m128i __hi, __m128i __lo)
+{
+ return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
+}
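+
+/* Editor's note: an illustrative sketch, not part of the upstream header.
+ * _mm256_set_m128i() takes the high half first:
+ *
+ *   __m128i lo = _mm_set1_epi32(1);
+ *   __m128i hi = _mm_set1_epi32(2);
+ *   __m256i v  = _mm256_set_m128i(hi, lo);   // bits [127:0] come from lo
+ */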
+
+/// Constructs a 256-bit floating-point vector of [8 x float] by
+/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
+/// similar to _mm256_set_m128, but the order of the input parameters is
+/// swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
+/// 128 bits of the result.
+/// \param __hi
+/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
+/// 128 bits of the result.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+/// concatenated result.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_setr_m128 (__m128 __lo, __m128 __hi)
+{
+ return _mm256_set_m128(__hi, __lo);
+}
+
+/// Constructs a 256-bit floating-point vector of [4 x double] by
+/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
+/// similar to _mm256_set_m128d, but the order of the input parameters is
+/// swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
+/// 128 bits of the result.
+/// \param __hi
+/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
+/// 128 bits of the result.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+/// concatenated result.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_setr_m128d (__m128d __lo, __m128d __hi)
+{
+ return (__m256d)_mm256_set_m128d(__hi, __lo);
+}
+
+/// Constructs a 256-bit integer vector by concatenating two 128-bit
+/// integer vectors. This is similar to _mm256_set_m128i, but the order of
+/// the input parameters is swapped.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
+///
+/// \param __lo
+/// A 128-bit integer vector to be copied to the lower 128 bits of the
+/// result.
+/// \param __hi
+/// A 128-bit integer vector to be copied to the upper 128 bits of the
+/// result.
+/// \returns A 256-bit integer vector containing the concatenated result.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_setr_m128i (__m128i __lo, __m128i __hi)
+{
+ return (__m256i)_mm256_set_m128i(__hi, __lo);
+}
+
+/* SIMD load ops (unaligned) */
+/// Loads two 128-bit floating-point vectors of [4 x float] from
+/// unaligned memory locations and constructs a 256-bit floating-point vector
+/// of [8 x float] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+/// <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location containing 4 consecutive
+/// single-precision floating-point values. These values are to be copied to
+/// bits[255:128] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location containing 4 consecutive
+/// single-precision floating-point values. These values are to be copied to
+/// bits[127:0] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \returns A 256-bit floating-point vector of [8 x float] containing the
+/// concatenated result.
+static __inline __m256 __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
+{
+ return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
+}
+
+/// Loads two 128-bit floating-point vectors of [2 x double] from
+/// unaligned memory locations and constructs a 256-bit floating-point vector
+/// of [4 x double] by concatenating the two 128-bit vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+/// <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location containing two consecutive
+/// double-precision floating-point values. These values are to be copied to
+/// bits[255:128] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location containing two consecutive
+/// double-precision floating-point values. These values are to be copied to
+/// bits[127:0] of the result. The address of the memory location does not
+/// have to be aligned.
+/// \returns A 256-bit floating-point vector of [4 x double] containing the
+/// concatenated result.
+static __inline __m256d __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
+{
+ return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
+}
+
+/// Loads two 128-bit integer vectors from unaligned memory locations and
+/// constructs a 256-bit integer vector by concatenating the two 128-bit
+/// vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to load instructions followed by the
+/// <c> VINSERTF128 </c> instruction.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location containing a 128-bit integer
+/// vector. This vector is to be copied to bits[255:128] of the result. The
+/// address of the memory location does not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location containing a 128-bit integer
+/// vector. This vector is to be copied to bits[127:0] of the result. The
+/// address of the memory location does not have to be aligned.
+/// \returns A 256-bit integer vector containing the concatenated result.
+static __inline __m256i __DEFAULT_FN_ATTRS
+_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
+{
+ return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
+}
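+
+/* Editor's note: a usage sketch, not part of the upstream header. The two
+ * halves may come from unrelated, unaligned addresses (lo_buf and hi_buf are
+ * assumed int[4] arrays holding the desired halves):
+ *
+ *   __m256i v = _mm256_loadu2_m128i((const __m128i_u *)hi_buf,
+ *                                   (const __m128i_u *)lo_buf);
+ */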
+
+/* SIMD store ops (unaligned) */
+/// Stores the upper and lower 128 bits of a 256-bit floating-point
+/// vector of [8 x float] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+/// store instructions.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __a
+/// A 256-bit floating-point vector of [8 x float].
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
+{
+ __m128 __v128;
+
+ __v128 = _mm256_castps256_ps128(__a);
+ _mm_storeu_ps(__addr_lo, __v128);
+ __v128 = _mm256_extractf128_ps(__a, 1);
+ _mm_storeu_ps(__addr_hi, __v128);
+}
+
+/// Stores the upper and lower 128 bits of a 256-bit floating-point
+/// vector of [4 x double] into two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+/// store instructions.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __a
+/// A 256-bit floating-point vector of [4 x double].
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
+{
+ __m128d __v128;
+
+ __v128 = _mm256_castpd256_pd128(__a);
+ _mm_storeu_pd(__addr_lo, __v128);
+ __v128 = _mm256_extractf128_pd(__a, 1);
+ _mm_storeu_pd(__addr_hi, __v128);
+}
+
+/// Stores the upper and lower 128 bits of a 256-bit integer vector into
+/// two different unaligned memory locations.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
+/// store instructions.
+///
+/// \param __addr_hi
+/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __addr_lo
+/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
+/// copied to this memory location. The address of this memory location does
+/// not have to be aligned.
+/// \param __a
+/// A 256-bit integer vector.
+static __inline void __DEFAULT_FN_ATTRS
+_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
+{
+ __m128i __v128;
+
+ __v128 = _mm256_castsi256_si128(__a);
+ _mm_storeu_si128(__addr_lo, __v128);
+ __v128 = _mm256_extractf128_si256(__a, 1);
+ _mm_storeu_si128(__addr_hi, __v128);
+}
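+
+/* Editor's note: a usage sketch, not part of the upstream header. This is the
+ * store-side counterpart of _mm256_loadu2_m128i() (v256 is an assumed
+ * __m256i):
+ *
+ *   int lo_out[4], hi_out[4];
+ *   _mm256_storeu2_m128i((__m128i_u *)hi_out, (__m128i_u *)lo_out, v256);
+ */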
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS128
+
+#endif /* __AVXINTRIN_H */
--- /dev/null
+/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
+ *
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXVNNIINTRIN_H
+#define __AVXVNNIINTRIN_H
+
+/* The intrinsics below, defined in avx512vlvnniintrin.h, can be used for AVXVNNI. */
+/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
+/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
+/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
+
+/* Intrinsics whose names contain _avx_ are provided for compatibility with MSVC. */
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
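+
+/* Editor's note: an illustrative sketch, not part of the upstream header.
+ * A typical use is accumulating unsigned-by-signed byte dot products into
+ * 32-bit lanes (a_u8 and b_s8 are assumed __m256i vectors of packed bytes):
+ *
+ *   __m256i acc = _mm256_setzero_si256();
+ *   acc = _mm256_dpbusd_avx_epi32(acc, a_u8, b_s8);
+ */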
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 7
+/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// DST[MAX:256] := 0
+/// \endoperation
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
+/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
+/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
+/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
+/// and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
+/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
+/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
+/// using signed saturation, and store the packed 32-bit results in DST.
+///
+/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
+///
+/// \operation
+/// FOR j := 0 to 3
+/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
+/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
+/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
+/// ENDFOR
+/// DST[MAX:128] := 0
+/// \endoperation
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXVNNIINTRIN_H
--- /dev/null
+/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMI2INTRIN_H
+#define __BMI2INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bzhi_u32(unsigned int __X, unsigned int __Y)
+{
+ return __builtin_ia32_bzhi_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pdep_u32(unsigned int __X, unsigned int __Y)
+{
+ return __builtin_ia32_pdep_si(__X, __Y);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_pext_u32(unsigned int __X, unsigned int __Y)
+{
+ return __builtin_ia32_pext_si(__X, __Y);
+}
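+
+/* Usage sketch (illustrative): _bzhi_u32 zeroes all bits at and above a given
+ * index, _pdep_u32 scatters the low bits of its first operand to the bit
+ * positions selected by the mask, and _pext_u32 gathers the masked bits into
+ * the low bits of the result. Requires BMI2 (e.g. -mbmi2).
+ *
+ *   unsigned int a = _bzhi_u32(0xFFFFFFFFu, 8);           // 0x000000FF
+ *   unsigned int b = _pdep_u32(0xFFu, 0x000F000Fu);       // 0x000F000F
+ *   unsigned int c = _pext_u32(0x000F000Fu, 0x000F000Fu); // 0x000000FF
+ */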
+
+#ifdef __x86_64__
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bzhi_u64(unsigned long long __X, unsigned long long __Y)
+{
+ return __builtin_ia32_bzhi_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pdep_u64(unsigned long long __X, unsigned long long __Y)
+{
+ return __builtin_ia32_pdep_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_pext_u64(unsigned long long __X, unsigned long long __Y)
+{
+ return __builtin_ia32_pext_di(__X, __Y);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+ unsigned long long *__P)
+{
+ unsigned __int128 __res = (unsigned __int128) __X * __Y;
+ *__P = (unsigned long long) (__res >> 64);
+ return (unsigned long long) __res;
+}
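+
+/* Usage sketch (illustrative): full 64 x 64 -> 128-bit multiply; the high half
+ * is written through the pointer argument and the low half is returned. The
+ * operands x and y are hypothetical.
+ *
+ *   unsigned long long hi;
+ *   unsigned long long lo = _mulx_u64(x, y, &hi);
+ */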
+
+#else /* !__x86_64__ */
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+ unsigned long long __res = (unsigned long long) __X * __Y;
+ *__P = (unsigned int) (__res >> 32);
+ return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __BMI2INTRIN_H */
--- /dev/null
+/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __BMIINTRIN_H
+#define __BMIINTRIN_H
+
+/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
+ instruction behaves as BSF on non-BMI targets, there is code that expects
+ to use it as a potentially faster version of BSF. */
+#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+
+#define _tzcnt_u16(a) (__tzcnt_u16((a)))
+
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+/// An unsigned 16-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of trailing zero
+/// bits in the operand.
+static __inline__ unsigned short __RELAXED_FN_ATTRS
+__tzcnt_u16(unsigned short __X)
+{
+ return __builtin_ia32_tzcnt_u16(__X);
+}
+
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+/// An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of trailing zero
+/// bits in the operand.
+static __inline__ unsigned int __RELAXED_FN_ATTRS
+__tzcnt_u32(unsigned int __X)
+{
+ return __builtin_ia32_tzcnt_u32(__X);
+}
+
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+/// An unsigned 32-bit integer whose trailing zeros are to be counted.
+/// \returns A 32-bit integer containing the number of trailing zero bits in
+/// the operand.
+static __inline__ int __RELAXED_FN_ATTRS
+_mm_tzcnt_32(unsigned int __X)
+{
+ return __builtin_ia32_tzcnt_u32(__X);
+}
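+
+/* Usage sketch (illustrative): for a nonzero operand the result is the index
+ * of the lowest set bit; for zero it is the operand width, unlike BSF.
+ *
+ *   unsigned int a = __tzcnt_u32(0x00000008u); // 3
+ *   unsigned int b = __tzcnt_u32(0u);          // 32
+ */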
+
+#define _tzcnt_u32(a) (__tzcnt_u32((a)))
+
+#ifdef __x86_64__
+
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of trailing zero
+/// bits in the operand.
+static __inline__ unsigned long long __RELAXED_FN_ATTRS
+__tzcnt_u64(unsigned long long __X)
+{
+ return __builtin_ia32_tzcnt_u64(__X);
+}
+
+/// Counts the number of trailing zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer whose trailing zeros are to be counted.
+/// \returns A 64-bit integer containing the number of trailing zero bits in
+/// the operand.
+static __inline__ long long __RELAXED_FN_ATTRS
+_mm_tzcnt_64(unsigned long long __X)
+{
+ return __builtin_ia32_tzcnt_u64(__X);
+}
+
+#define _tzcnt_u64(a) (__tzcnt_u64((a)))
+
+#endif /* __x86_64__ */
+
+#undef __RELAXED_FN_ATTRS
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__BMI__)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
+
+#define _andn_u32(a, b) (__andn_u32((a), (b)))
+
+/* _bextr_u32 != __bextr_u32 */
+#define _blsi_u32(a) (__blsi_u32((a)))
+
+#define _blsmsk_u32(a) (__blsmsk_u32((a)))
+
+#define _blsr_u32(a) (__blsr_u32((a)))
+
+/// Performs a bitwise AND of the second operand with the one's
+/// complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param __X
+/// An unsigned integer containing one of the operands.
+/// \param __Y
+/// An unsigned integer containing one of the operands.
+/// \returns An unsigned integer containing the bitwise AND of the second
+/// operand with the one's complement of the first operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__andn_u32(unsigned int __X, unsigned int __Y)
+{
+ return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+/// Extracts the specified bits from the first operand and returns them
+/// in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+/// An unsigned integer whose bits are to be extracted.
+/// \param __Y
+/// An unsigned integer used to specify which bits are extracted. Bits [7:0]
+/// specify the index of the least significant bit. Bits [15:8] specify the
+/// number of bits to be extracted.
+/// \returns An unsigned integer whose least significant bits contain the
+/// extracted bits.
+/// \see _bextr_u32
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__bextr_u32(unsigned int __X, unsigned int __Y)
+{
+ return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+/// Extracts the specified bits from the first operand and returns them
+/// in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+/// An unsigned integer whose bits are to be extracted.
+/// \param __Y
+/// An unsigned integer used to specify the index of the least significant
+/// bit for the bits to be extracted. Bits [7:0] specify the index.
+/// \param __Z
+/// An unsigned integer used to specify the number of bits to be extracted.
+/// Bits [7:0] specify the number of bits.
+/// \returns An unsigned integer whose least significant bits contain the
+/// extracted bits.
+/// \see __bextr_u32
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
+{
+ return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
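+
+/* Usage sketch (illustrative): both BEXTR forms extract the same bit field;
+ * the Intel form packs the start index and length into the control word that
+ * the AMD form takes directly.
+ *
+ *   unsigned int v = 0x12345678u;
+ *   unsigned int f1 = _bextr_u32(v, 8, 12);            // bits [19:8] -> 0x456
+ *   unsigned int f2 = __bextr_u32(v, 8u | (12u << 8)); // same result
+ */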
+
+/* Intel-specified, single-leading-underscore version of BEXTR2 */
+/// Extracts the specified bits from the first operand and returns them
+/// in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+/// An unsigned integer whose bits are to be extracted.
+/// \param __Y
+/// An unsigned integer used to specify which bits are extracted. Bits [7:0]
+/// specify the index of the least significant bit. Bits [15:8] specify the
+/// number of bits to be extracted.
+/// \returns An unsigned integer whose least significant bits contain the
+/// extracted bits.
+/// \see __bextr_u32
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_bextr2_u32(unsigned int __X, unsigned int __Y) {
+ return __builtin_ia32_bextr_u32(__X, __Y);
+}
+
+/// Clears all bits in the source except for the least significant bit
+/// containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param __X
+/// An unsigned integer whose bits are to be cleared.
+/// \returns An unsigned integer containing the result of clearing the bits from
+/// the source operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsi_u32(unsigned int __X)
+{
+ return __X & -__X;
+}
+
+/// Creates a mask whose bits are set to 1, using bit 0 up to and
+/// including the least significant bit that is set to 1 in the source
+/// operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param __X
+/// An unsigned integer used to create the mask.
+/// \returns An unsigned integer containing the newly created mask.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsmsk_u32(unsigned int __X)
+{
+ return __X ^ (__X - 1);
+}
+
+/// Clears the least significant bit that is set to 1 in the source
+/// operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param __X
+/// An unsigned integer containing the operand to be cleared.
+/// \returns An unsigned integer containing the result of clearing the source
+/// operand.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsr_u32(unsigned int __X)
+{
+ return __X & (__X - 1);
+}
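+
+/* Usage sketch (illustrative): for x = 0x68 (binary 0110 1000) the lowest set
+ * bit is bit 3.
+ *
+ *   __blsi_u32(0x68u);   // 0x08 : isolate the lowest set bit
+ *   __blsmsk_u32(0x68u); // 0x0F : mask up to and including that bit
+ *   __blsr_u32(0x68u);   // 0x60 : clear that bit
+ */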
+
+#ifdef __x86_64__
+
+#define _andn_u64(a, b) (__andn_u64((a), (b)))
+
+/* _bextr_u64 != __bextr_u64 */
+#define _blsi_u64(a) (__blsi_u64((a)))
+
+#define _blsmsk_u64(a) (__blsmsk_u64((a)))
+
+#define _blsr_u64(a) (__blsr_u64((a)))
+
+/// Performs a bitwise AND of the second operand with the one's
+/// complement of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ANDN </c> instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer containing one of the operands.
+/// \param __Y
+/// An unsigned 64-bit integer containing one of the operands.
+/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
+/// operand with the one's complement of the first operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+ return ~__X & __Y;
+}
+
+/* AMD-specified, double-leading-underscore version of BEXTR */
+/// Extracts the specified bits from the first operand and returns them
+/// in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+/// An unsigned 64-bit integer used to specify which bits are extracted. Bits
+/// [7:0] specify the index of the least significant bit. Bits [15:8] specify
+/// the number of bits to be extracted.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+/// extracted bits.
+/// \see _bextr_u64
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__bextr_u64(unsigned long long __X, unsigned long long __Y)
+{
+ return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR */
+/// Extracts the specified bits from the first operand and returns them
+/// in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+/// An unsigned integer used to specify the index of the least significant
+/// bit for the bits to be extracted. Bits [7:0] specify the index.
+/// \param __Z
+/// An unsigned integer used to specify the number of bits to be extracted.
+/// Bits [7:0] specify the number of bits.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+/// extracted bits.
+/// \see __bextr_u64
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+ return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+/* Intel-specified, single-leading-underscore version of BEXTR2 */
+/// Extracts the specified bits from the first operand and returns them
+/// in the least significant bits of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer whose bits are to be extracted.
+/// \param __Y
+/// An unsigned 64-bit integer used to specify which bits are extracted. Bits
+/// [7:0] specify the index of the least significant bit. Bits [15:8] specify
+/// the number of bits to be extracted.
+/// \returns An unsigned 64-bit integer whose least significant bits contain the
+/// extracted bits.
+/// \see __bextr_u64
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_bextr2_u64(unsigned long long __X, unsigned long long __Y) {
+ return __builtin_ia32_bextr_u64(__X, __Y);
+}
+
+/// Clears all bits in the source except for the least significant bit
+/// containing a value of 1 and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSI </c> instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer whose bits are to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+/// bits from the source operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsi_u64(unsigned long long __X)
+{
+ return __X & -__X;
+}
+
+/// Creates a mask whose bits are set to 1, using bit 0 up to and
+/// including the least significant bit that is set to 1 in the source
+/// operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer used to create the mask.
+/// \returns An unsigned 64-bit integer containing the newly created mask.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsmsk_u64(unsigned long long __X)
+{
+ return __X ^ (__X - 1);
+}
+
+/// Clears the least significant bit that is set to 1 in the source
+/// operand and returns the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> BLSR </c> instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer containing the operand to be cleared.
+/// \returns An unsigned 64-bit integer containing the result of clearing the
+/// source operand.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsr_u64(unsigned long long __X)
+{
+ return __X & (__X - 1);
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
+ || defined(__BMI__) */
+
+#endif /* __BMIINTRIN_H */
--- /dev/null
+/*===---- cetintrin.h - CET intrinsic --------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <cetintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CETINTRIN_H
+#define __CETINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("shstk")))
+
+static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
+ __builtin_ia32_incsspd(__a);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
+ __builtin_ia32_incsspq(__a);
+}
+#endif /* __x86_64__ */
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
+ __builtin_ia32_incsspq(__a);
+}
+#else /* __x86_64__ */
+static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
+ __builtin_ia32_incsspd((int)__a);
+}
+#endif /* __x86_64__ */
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
+ return __builtin_ia32_rdsspd(__a);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32(void) {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wuninitialized"
+ unsigned int t; /* intentionally uninitialized dummy input for RDSSPD */
+ return __builtin_ia32_rdsspd(t);
+#pragma clang diagnostic pop
+}
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
+ return __builtin_ia32_rdsspq(__a);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64(void) {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wuninitialized"
+ unsigned long long t; /* intentionally uninitialized dummy input for RDSSPQ */
+ return __builtin_ia32_rdsspq(t);
+#pragma clang diagnostic pop
+}
+#endif /* __x86_64__ */
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS _get_ssp(void) {
+ return __builtin_ia32_rdsspq(0);
+}
+#else /* __x86_64__ */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) {
+ return __builtin_ia32_rdsspd(0);
+}
+#endif /* __x86_64__ */
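+
+/* Usage sketch (illustrative): because RDSSP is a no-op when shadow stacks
+ * are not enabled and _get_ssp() seeds the result with 0, a zero return can
+ * be used to detect that no shadow stack is active for the current thread.
+ *
+ *   if (_get_ssp() != 0) {
+ *     // a CET shadow stack is active
+ *   }
+ */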
+
+static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
+ __builtin_ia32_saveprevssp();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) {
+ __builtin_ia32_rstorssp(__p);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) {
+ __builtin_ia32_wrssd(__a, __p);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) {
+ __builtin_ia32_wrssq(__a, __p);
+}
+#endif /* __x86_64__ */
+
+static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) {
+ __builtin_ia32_wrussd(__a, __p);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) {
+ __builtin_ia32_wrussq(__a, __p);
+}
+#endif /* __x86_64__ */
+
+static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() {
+ __builtin_ia32_setssbsy();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) {
+ __builtin_ia32_clrssbsy(__p);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __CETINTRIN_H */
--- /dev/null
+/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <cldemoteintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __CLDEMOTEINTRIN_H
+#define __CLDEMOTEINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("cldemote")))
+
+/// Hint to hardware that the cache line that contains \p __P should be demoted
+/// from the cache closest to the processor core to a level more distant from
+/// the processor core.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CLDEMOTE </c> instruction.
+static __inline__ void __DEFAULT_FN_ATTRS
+_cldemote(const void * __P) {
+ __builtin_ia32_cldemote(__P);
+}
+
+#define _mm_cldemote(p) _cldemote(p)
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CLFLUSHOPTINTRIN_H
+#define __CLFLUSHOPTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clflushopt(void const * __m) {
+ __builtin_ia32_clflushopt(__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __CLWBINTRIN_H
+#define __CLWBINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb")))
+
+/// Writes back to memory the cache line (if modified) that contains the
+/// linear address specified in \a __p from any level of the cache hierarchy in
+/// the cache coherence domain.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> CLWB </c> instruction.
+///
+/// \param __p
+/// A pointer to the memory location used to identify the cache line to be
+/// written back.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clwb(void const *__p) {
+ __builtin_ia32_clwb(__p);
+}
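+
+/* Usage sketch (illustrative): writing back a modified line, e.g. for
+ * persistent-memory programming, where the write-back is typically followed
+ * by a store fence. The pointer p is hypothetical; _mm_sfence() is declared
+ * in <xmmintrin.h>.
+ *
+ *   *(volatile int *)p = 42;
+ *   _mm_clwb(p);
+ *   _mm_sfence();
+ */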
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===----------------------- clzerointrin.h - CLZERO ----------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __CLZEROINTRIN_H
+#define __CLZEROINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("clzero")))
+
+/// Zeroes out the cache line that contains the given address.
+///
+/// \headerfile <clzerointrin.h>
+///
+/// This intrinsic corresponds to the <c> CLZERO </c> instruction.
+///
+/// \param __line
+/// A pointer to the cache line that needs to be zeroed out.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_clzero (void * __line)
+{
+ __builtin_ia32_clzero ((void *)__line);
+}
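+
+/* Usage sketch (illustrative): CLZERO zeroes the whole cache line containing
+ * the address, so the pointer is normally aligned to the cache-line size
+ * (64 bytes on current AMD processors). The buffer buf is hypothetical.
+ *
+ *   _Alignas(64) static char buf[64];
+ *   _mm_clzero(buf);
+ */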
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __CLZEROINTRIN_H */
--- /dev/null
+/*===---- crc32intrin.h - SSE4.2 Accumulate CRC32 intrinsics ---------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CRC32INTRIN_H
+#define __CRC32INTRIN_H
+
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("crc32")))
+
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
+/// unsigned char operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
+///
+/// \param __C
+/// An unsigned integer operand to add to the CRC-32C checksum of operand
+/// \a __D.
+/// \param __D
+/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+/// operand \a __D.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u8(unsigned int __C, unsigned char __D)
+{
+ return __builtin_ia32_crc32qi(__C, __D);
+}
+
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
+/// unsigned short operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
+///
+/// \param __C
+/// An unsigned integer operand to add to the CRC-32C checksum of operand
+/// \a __D.
+/// \param __D
+/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+/// operand \a __D.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u16(unsigned int __C, unsigned short __D)
+{
+ return __builtin_ia32_crc32hi(__C, __D);
+}
+
+/// Adds the first unsigned integer operand to the CRC-32C checksum of
+/// the second unsigned integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
+///
+/// \param __C
+/// An unsigned integer operand to add to the CRC-32C checksum of operand
+/// \a __D.
+/// \param __D
+/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+/// operand \a __D.
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_crc32_u32(unsigned int __C, unsigned int __D)
+{
+ return __builtin_ia32_crc32si(__C, __D);
+}
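+
+/* Usage sketch (illustrative): accumulating a CRC-32C checksum byte by byte
+ * over a buffer, using the customary initial value and final inversion. The
+ * names data and len are hypothetical.
+ *
+ *   unsigned int crc = 0xFFFFFFFFu;
+ *   for (size_t i = 0; i < len; ++i)
+ *     crc = _mm_crc32_u8(crc, data[i]);
+ *   crc = ~crc;
+ */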
+
+#ifdef __x86_64__
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
+/// unsigned 64-bit integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
+///
+/// \param __C
+/// An unsigned integer operand to add to the CRC-32C checksum of operand
+/// \a __D.
+/// \param __D
+/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
+/// \returns The result of adding operand \a __C to the CRC-32C checksum of
+/// operand \a __D.
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
+{
+ return __builtin_ia32_crc32di(__C, __D);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __CRC32INTRIN_H */
--- /dev/null
+/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __EMMINTRIN_H
+#define __EMMINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+#include <xmmintrin.h>
+
+typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
+typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
+
+typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
+typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));
+
+/* Type defines. */
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef short __v8hi __attribute__((__vector_size__(16)));
+typedef char __v16qi __attribute__((__vector_size__(16)));
+
+/* Unsigned types */
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
+typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
+
+/* We need an explicitly signed variant for char. Note that this shouldn't
+ * appear in the interface though. */
+typedef signed char __v16qs __attribute__((__vector_size__(16)));
+
+#if (__clang_major__ > 15)
+#ifdef __SSE2__
+/* Both _Float16 and __bf16 require SSE2 to be enabled. */
+typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
+typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
+typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
+
+typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
+typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
+#endif
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64)))
+
+/// Adds lower double-precision values in both operands and returns the
+/// sum in the lower 64 bits of the result. The upper 64 bits of the result
+/// are copied from the upper double-precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
+/// from the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_add_sd(__m128d __a, __m128d __b)
+{
+ __a[0] += __b[0];
+ return __a;
+}
+
+/// Adds two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the sums of both
+/// operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_add_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)((__v2df)__a + (__v2df)__b);
+}
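+
+/* Usage sketch (illustrative): element-wise arithmetic on [2 x double].
+ * _mm_set_pd is declared later in this header; its first argument is the
+ * upper element.
+ *
+ *   __m128d x = _mm_set_pd(2.0, 1.0);   // { 1.0, 2.0 }
+ *   __m128d y = _mm_set_pd(20.0, 10.0); // { 10.0, 20.0 }
+ *   __m128d s = _mm_add_pd(x, y);       // { 11.0, 22.0 }
+ */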
+
+/// Subtracts the lower double-precision value of the second operand
+/// from the lower double-precision value of the first operand and returns
+/// the difference in the lower 64 bits of the result. The upper 64 bits of
+/// the result are copied from the upper double-precision value of the first
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// difference of the lower 64 bits of both operands. The upper 64 bits are
+/// copied from the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sub_sd(__m128d __a, __m128d __b)
+{
+ __a[0] -= __b[0];
+ return __a;
+}
+
+/// Subtracts two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the minuend.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the subtrahend.
+/// \returns A 128-bit vector of [2 x double] containing the differences between
+/// both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sub_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)((__v2df)__a - (__v2df)__b);
+}
+
+/// Multiplies lower double-precision values in both operands and returns
+/// the product in the lower 64 bits of the result. The upper 64 bits of the
+/// result are copied from the upper double-precision value of the first
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// product of the lower 64 bits of both operands. The upper 64 bits are
+/// copied from the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mul_sd(__m128d __a, __m128d __b)
+{
+ __a[0] *= __b[0];
+ return __a;
+}
+
+/// Multiplies two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the products of both
+/// operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_mul_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)((__v2df)__a * (__v2df)__b);
+}
+
+/// Divides the lower double-precision value of the first operand by the
+/// lower double-precision value of the second operand and returns the
+/// quotient in the lower 64 bits of the result. The upper 64 bits of the
+/// result are copied from the upper double-precision value of the first
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the divisor.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// quotient of the lower 64 bits of both operands. The upper 64 bits are
+/// copied from the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_div_sd(__m128d __a, __m128d __b)
+{
+ __a[0] /= __b[0];
+ return __a;
+}
+
+/// Performs an element-by-element division of two 128-bit vectors of
+/// [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the dividend.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the divisor.
+/// \returns A 128-bit vector of [2 x double] containing the quotients of both
+/// operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_div_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)((__v2df)__a / (__v2df)__b);
+}
+
+/// Calculates the square root of the lower double-precision value of
+/// the second operand and returns it in the lower 64 bits of the result.
+/// The upper 64 bits of the result are copied from the upper
+/// double-precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// upper 64 bits of this operand are copied to the upper 64 bits of the
+/// result.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// square root is calculated using the lower 64 bits of this operand.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// square root of the lower 64 bits of operand \a __b, and whose upper 64
+/// bits are copied from the upper 64 bits of operand \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sqrt_sd(__m128d __a, __m128d __b)
+{
+ __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
+ return __extension__ (__m128d) { __c[0], __a[1] };
+}
+
+/// Calculates the square root of each of the two values stored in a
+/// 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [2 x double] containing the square roots of the
+/// values in the operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_sqrt_pd(__m128d __a)
+{
+ return __builtin_ia32_sqrtpd((__v2df)__a);
+}
+
+/// Compares lower 64-bit double-precision values of both operands, and
+/// returns the lesser of the pair of values in the lower 64-bits of the
+/// result. The upper 64 bits of the result are copied from the upper
+/// double-precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// lower 64 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// lower 64 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// minimum value between both operands. The upper 64 bits are copied from
+/// the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_min_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
+}
+
+/// Performs element-by-element comparison of the two 128-bit vectors of
+/// [2 x double] and returns the vector containing the lesser of each pair of
+/// values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the minimum values
+/// between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_min_pd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares lower 64-bit double-precision values of both operands, and
+/// returns the greater of the pair of values in the lower 64-bits of the
+/// result. The upper 64 bits of the result are copied from the upper
+/// double-precision value of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// lower 64 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands. The
+/// lower 64 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// maximum value between both operands. The upper 64 bits are copied from
+/// the upper 64 bits of the first source operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_max_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
+}
+
+/// Performs element-by-element comparison of the two 128-bit vectors of
+/// [2 x double] and returns the vector containing the greater of each pair
+/// of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the operands.
+/// \returns A 128-bit vector of [2 x double] containing the maximum values
+/// between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_max_pd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
+}
+
+/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+/// values between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_and_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)((__v2du)__a & (__v2du)__b);
+}
+
+/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
+/// the one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the left source operand. The
+/// one's complement of this value is used in the bitwise AND.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the right source operand.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
+/// values in the second operand and the one's complement of the first
+/// operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_andnot_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)(~(__v2du)__a & (__v2du)__b);
+}
+
+/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
+/// values between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_or_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)((__v2du)__a | (__v2du)__b);
+}
+
+/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
+/// values between both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_xor_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)((__v2du)__a ^ (__v2du)__b);
+}
+
+/// Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
+/// for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpeq_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are less than those in the second operand. Each comparison
+/// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmplt_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
+}
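+
+/* Usage sketch (illustrative): the all-ones/all-zeros masks produced by these
+ * compares combine with the bitwise operations above into a branchless
+ * per-element select, e.g. an element-wise minimum (ignoring NaN handling).
+ * The vectors a and b are hypothetical.
+ *
+ *   __m128d m = _mm_cmplt_pd(a, b);
+ *   __m128d min = _mm_or_pd(_mm_and_pd(m, a), _mm_andnot_pd(m, b));
+ */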
+
+/// Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are less than or equal to those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmple_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are greater than those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpgt_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
+}
+
+/// Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are greater than or equal to those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpge_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
+}
+
+/// Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are ordered with respect to those in the second operand.
+///
+/// A pair of double-precision values are "ordered" with respect to each
+/// other if neither value is a NaN. Each comparison yields 0x0 for false,
+/// 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpord_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are unordered with respect to those in the second operand.
+///
+/// A pair of double-precision values are "unordered" with respect to each
+/// other if one or both values are NaN. Each comparison yields 0x0 for
+/// false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpunord_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are unequal to those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpneq_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are not less than those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnlt_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are not less than or equal to those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnle_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are not greater than those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpngt_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
+}
+
+/// Compares each of the corresponding double-precision values of the
+/// 128-bit vectors of [2 x double] to determine if the values in the first
+/// operand are not greater than or equal to those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \param __b
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector containing the comparison results.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnge_pd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
+}
+
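+/* Illustrative usage sketch (editorial example, assumes SSE2): the negated
+ * predicates are the bitwise complement of the plain predicates, so a NaN
+ * lane makes _mm_cmplt_pd() false but _mm_cmpnlt_pd() true.
+ *
+ *   __m128d a = _mm_set_pd(__builtin_nan(""), 1.0);   // a = { 1.0, NaN }
+ *   __m128d b = _mm_set1_pd(2.0);                     // b = { 2.0, 2.0 }
+ *   __m128d lt  = _mm_cmplt_pd(a, b);                 // { all-ones, 0x0 }
+ *   __m128d nlt = _mm_cmpnlt_pd(a, b);                // { 0x0, all-ones }
+ */
+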
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] for equality.
+///
+/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpeq_sd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than the corresponding value in
+/// the second parameter.
+///
+/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmplt_sd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than or equal to the
+/// corresponding value in the second parameter.
+///
+/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmple_sd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than the corresponding value
+/// in the second parameter.
+///
+/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpgt_sd(__m128d __a, __m128d __b)
+{
+ __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
+ return __extension__ (__m128d) { __c[0], __a[1] };
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than or equal to the
+/// corresponding value in the second parameter.
+///
+/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpge_sd(__m128d __a, __m128d __b)
+{
+ __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
+ return __extension__ (__m128d) { __c[0], __a[1] };
+}
+
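+/* Illustrative usage sketch (editorial example, assumes SSE2): the _sd
+ * compares only test the low lane; the high lane of the result is passed
+ * through from the first argument, as in the { __c[0], __a[1] } construction
+ * above.
+ *
+ *   __m128d a = _mm_set_pd(7.0, 3.0);    // a = { 3.0, 7.0 }
+ *   __m128d b = _mm_set_pd(-1.0, 2.0);   // b = { 2.0, -1.0 }
+ *   __m128d m = _mm_cmpge_sd(a, b);      // { all-ones, 7.0 }
+ */
+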
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is "ordered" with respect to the
+/// corresponding value in the second parameter.
+///
+/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
+/// of double-precision values are "ordered" with respect to each other if
+/// neither value is a NaN.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpord_sd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is "unordered" with respect to the
+/// corresponding value in the second parameter.
+///
+/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
+/// of double-precision values are "unordered" with respect to each other if
+/// one or both values are NaN.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpunord_sd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is unequal to the corresponding value in
+/// the second parameter.
+///
+/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpneq_sd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is not less than the corresponding
+/// value in the second parameter.
+///
+/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnlt_sd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is not less than or equal to the
+/// corresponding value in the second parameter.
+///
+/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnle_sd(__m128d __a, __m128d __b)
+{
+ return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is not greater than the corresponding
+/// value in the second parameter.
+///
+/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpngt_sd(__m128d __a, __m128d __b)
+{
+ __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
+ return __extension__ (__m128d) { __c[0], __a[1] };
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is not greater than or equal to the
+/// corresponding value in the second parameter.
+///
+/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns A 128-bit vector. The lower 64 bits contains the comparison
+/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cmpnge_sd(__m128d __a, __m128d __b)
+{
+ __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
+ return __extension__ (__m128d) { __c[0], __a[1] };
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] for equality.
+///
+/// The comparison yields 0 for false, 1 for true. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comieq_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than the corresponding value in
+/// the second parameter.
+///
+/// The comparison yields 0 for false, 1 for true. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comilt_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than or equal to the
+/// corresponding value in the second parameter.
+///
+/// The comparison yields 0 for false, 1 for true. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comile_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than the corresponding value
+/// in the second parameter.
+///
+/// The comparison yields 0 for false, 1 for true. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comigt_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than or equal to the
+/// corresponding value in the second parameter.
+///
+/// The comparison yields 0 for false, 1 for true. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comige_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is unequal to the corresponding value in
+/// the second parameter.
+///
+/// The comparison yields 0 for false, 1 for true. If either of the two
+/// lower double-precision values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 1 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comineq_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] for equality. The
+/// comparison yields 0 for false, 1 for true.
+///
+/// If either of the two lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomieq_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than the corresponding value in
+/// the second parameter.
+///
+/// The comparison yields 0 for false, 1 for true. If either of the two lower
+/// double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomilt_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is less than or equal to the
+/// corresponding value in the second parameter.
+///
+/// The comparison yields 0 for false, 1 for true. If either of the two lower
+/// double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomile_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than the corresponding value
+/// in the second parameter.
+///
+/// The comparison yields 0 for false, 1 for true. If either of the two lower
+/// double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomigt_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is greater than or equal to the
+/// corresponding value in the second parameter.
+///
+/// The comparison yields 0 for false, 1 for true. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower double-precision values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomige_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
+}
+
+/// Compares the lower double-precision floating-point values in each of
+/// the two 128-bit floating-point vectors of [2 x double] to determine if
+/// the value in the first parameter is unequal to the corresponding value in
+/// the second parameter.
+///
+/// The comparison yields 0 for false, 1 for true. If either of the two lower
+/// double-precision values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __b.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision value is
+/// compared to the lower double-precision value of \a __a.
+/// \returns An integer containing the comparison result. If either of the two
+/// lower double-precision values is NaN, 1 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomineq_sd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
+}
+
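+/* Illustrative usage sketch (editorial example, assumes SSE2): both the comi
+ * and ucomi families return a plain int suitable for branching; the COMISD
+ * forms raise the invalid-operation flag for any NaN operand, whereas the
+ * UCOMISD forms do so only for signaling NaNs.
+ *
+ *   __m128d a = _mm_set_sd(1.5);
+ *   __m128d b = _mm_set_sd(2.5);
+ *   if (_mm_ucomilt_sd(a, b)) {
+ *     // taken: 1.5 < 2.5, so _mm_ucomilt_sd() returns 1
+ *   }
+ */
+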
+/// Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two single-precision floating-point
+/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
+/// The upper 64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+/// converted values. The upper 64 bits are set to zero.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtpd_ps(__m128d __a)
+{
+ return __builtin_ia32_cvtpd2ps((__v2df)__a);
+}
+
+/// Converts the lower two single-precision floating-point elements of a
+/// 128-bit vector of [4 x float] into two double-precision floating-point
+/// values, returned in a 128-bit vector of [2 x double]. The upper two
+/// elements of the input vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower two single-precision
+/// floating-point elements are converted to double-precision values. The
+/// upper two elements are unused.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtps_pd(__m128 __a)
+{
+ return (__m128d) __builtin_convertvector(
+ __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
+}
+
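+/* Illustrative usage sketch (editorial example, assumes SSE2): narrowing with
+ * _mm_cvtpd_ps() zeroes the upper two float lanes, and widening back with
+ * _mm_cvtps_pd() reads only the lower two, so the pair round-trips values
+ * that are exactly representable in single precision.
+ *
+ *   __m128d d  = _mm_set_pd(2.5, 1.5);   // { 1.5, 2.5 }
+ *   __m128  f  = _mm_cvtpd_ps(d);        // { 1.5f, 2.5f, 0.0f, 0.0f }
+ *   __m128d d2 = _mm_cvtps_pd(f);        // { 1.5, 2.5 }
+ */
+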
+/// Converts the lower two integer elements of a 128-bit vector of
+/// [4 x i32] into two double-precision floating-point values, returned in a
+/// 128-bit vector of [2 x double].
+///
+/// The upper two elements of the input vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
+/// converted to double-precision values.
+///
+/// The upper two elements are unused.
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtepi32_pd(__m128i __a)
+{
+ return (__m128d) __builtin_convertvector(
+ __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
+}
+
+/// Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
+/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
+/// 64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
+/// converted values. The upper 64 bits are set to zero.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtpd_epi32(__m128d __a)
+{
+ return __builtin_ia32_cvtpd2dq((__v2df)__a);
+}
+
+/// Converts the low-order element of a 128-bit vector of [2 x double]
+/// into a 32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+/// conversion.
+/// \returns A 32-bit signed integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsd_si32(__m128d __a)
+{
+ return __builtin_ia32_cvtsd2si((__v2df)__a);
+}
+
+/// Converts the lower double-precision floating-point element of a
+/// 128-bit vector of [2 x double], in the second parameter, into a
+/// single-precision floating-point value, returned in the lower 32 bits of a
+/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
+/// copied from the upper 96 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
+/// copied to the upper 96 bits of the result.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower double-precision
+/// floating-point element is used in the conversion.
+/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
+/// converted value from the second parameter. The upper 96 bits are copied
+/// from the upper 96 bits of the first parameter.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsd_ss(__m128 __a, __m128d __b)
+{
+ return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
+}
+
+/// Converts a 32-bit signed integer value, in the second parameter, into
+/// a double-precision floating-point value, returned in the lower 64 bits of
+/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
+/// are copied from the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
+/// copied to the upper 64 bits of the result.
+/// \param __b
+/// A 32-bit signed integer containing the value to be converted.
+/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
+/// converted value from the second parameter. The upper 64 bits are copied
+/// from the upper 64 bits of the first parameter.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtsi32_sd(__m128d __a, int __b)
+{
+ __a[0] = __b;
+ return __a;
+}
+
+/// Converts the lower single-precision floating-point element of a
+/// 128-bit vector of [4 x float], in the second parameter, into a
+/// double-precision floating-point value, returned in the lower 64 bits of
+/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
+/// are copied from the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
+/// copied to the upper 64 bits of the result.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower single-precision
+/// floating-point element is used in the conversion.
+/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
+/// converted value from the second parameter. The upper 64 bits are copied
+/// from the upper 64 bits of the first parameter.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtss_sd(__m128d __a, __m128 __b)
+{
+ __a[0] = __b[0];
+ return __a;
+}
+
+/// Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
+/// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
+///
+/// If the result of either conversion is inexact, the result is truncated
+/// (rounded towards zero) regardless of the current MXCSR setting. The upper
+/// 64 bits of the result vector are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
+/// converted values. The upper 64 bits are set to zero.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttpd_epi32(__m128d __a)
+{
+ return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
+}
+
+/// Converts the low-order element of a [2 x double] vector into a 32-bit
+/// signed integer value, truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+/// conversion.
+/// \returns A 32-bit signed integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttsd_si32(__m128d __a)
+{
+ return __builtin_ia32_cvttsd2si((__v2df)__a);
+}
+
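+/* Illustrative usage sketch (editorial example, assumes SSE2):
+ * _mm_cvtsd_si32() rounds according to MXCSR (round-to-nearest-even by
+ * default), while _mm_cvttsd_si32() always truncates toward zero.
+ *
+ *   __m128d v = _mm_set_sd(2.75);
+ *   int r = _mm_cvtsd_si32(v);    // 3 under the default rounding mode
+ *   int t = _mm_cvttsd_si32(v);   // 2 (truncated)
+ */
+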
+/// Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
+/// returned in a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_cvtpd_pi32(__m128d __a)
+{
+ return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
+}
+
+/// Converts the two double-precision floating-point elements of a
+/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
+/// returned in a 64-bit vector of [2 x i32].
+///
+/// If the result of either conversion is inexact, the result is truncated
+/// (rounded towards zero) regardless of the current MXCSR setting.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double].
+/// \returns A 64-bit vector of [2 x i32] containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_cvttpd_pi32(__m128d __a)
+{
+ return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
+}
+
+/// Converts the two signed 32-bit integer elements of a 64-bit vector of
+/// [2 x i32] into two double-precision floating-point values, returned in a
+/// 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
+///
+/// \param __a
+/// A 64-bit vector of [2 x i32].
+/// \returns A 128-bit vector of [2 x double] containing the converted values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
+_mm_cvtpi32_pd(__m64 __a)
+{
+ return __builtin_ia32_cvtpi2pd((__v2si)__a);
+}
+
+/// Returns the low-order element of a 128-bit vector of [2 x double] as
+/// a double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
+/// \returns A double-precision floating-point value copied from the lower 64
+/// bits of \a __a.
+static __inline__ double __DEFAULT_FN_ATTRS
+_mm_cvtsd_f64(__m128d __a)
+{
+ return __a[0];
+}
+
+/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location has to be 16-byte aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load_pd(double const *__dp)
+{
+ return *(const __m128d*)__dp;
+}
+
+/// Loads a double-precision floating-point value from a specified memory
+/// location and duplicates it to both vector elements of a 128-bit vector of
+/// [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
+///
+/// \param __dp
+/// A pointer to a memory location containing a double-precision value.
+/// \returns A 128-bit vector of [2 x double] containing the loaded and
+/// duplicated values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load1_pd(double const *__dp)
+{
+ struct __mm_load1_pd_struct {
+ double __u;
+ } __attribute__((__packed__, __may_alias__));
+ double __u = ((const struct __mm_load1_pd_struct*)__dp)->__u;
+ return __extension__ (__m128d){ __u, __u };
+}
+
+#define _mm_load_pd1(dp) _mm_load1_pd(dp)
+
+/// Loads two double-precision values, in reverse order, from an aligned
+/// memory location into a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
+/// needed shuffling instructions. In AVX mode, the shuffling may be combined
+/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
+///
+/// \param __dp
+/// A 16-byte aligned pointer to an array of double-precision values to be
+/// loaded in reverse order.
+/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
+/// values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadr_pd(double const *__dp)
+{
+ __m128d __u = *(const __m128d*)__dp;
+ return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
+}
+
+/// Loads a 128-bit floating-point vector of [2 x double] from an
+/// unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadu_pd(double const *__dp)
+{
+ struct __loadu_pd {
+ __m128d_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_pd*)__dp)->__v;
+}
+
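+/* Illustrative usage sketch (editorial example, assumes SSE2 and a
+ * GCC/Clang-style aligned attribute): _mm_load_pd() requires a 16-byte
+ * aligned pointer, while _mm_loadu_pd() accepts any alignment at a possible
+ * performance cost.
+ *
+ *   double buf[2] __attribute__((aligned(16))) = { 1.0, 2.0 };
+ *   __m128d va = _mm_load_pd(buf);    // aligned load: { 1.0, 2.0 }
+ *   __m128d vu = _mm_loadu_pd(buf);   // also valid, no alignment required
+ */
+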
+/// Loads a 64-bit integer value to the low element of a 128-bit integer
+/// vector and clears the upper element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+/// A pointer to a 64-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si64(void const *__a)
+{
+ struct __loadu_si64 {
+ long long __v;
+ } __attribute__((__packed__, __may_alias__));
+ long long __u = ((const struct __loadu_si64*)__a)->__v;
+ return __extension__ (__m128i)(__v2di){__u, 0LL};
+}
+
+/// Loads a 32-bit integer value to the low element of a 128-bit integer
+/// vector and clears the upper element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __a
+/// A pointer to a 32-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si32(void const *__a)
+{
+ struct __loadu_si32 {
+ int __v;
+ } __attribute__((__packed__, __may_alias__));
+ int __u = ((const struct __loadu_si32*)__a)->__v;
+ return __extension__ (__m128i)(__v4si){__u, 0, 0, 0};
+}
+
+/// Loads a 16-bit integer value to the low element of a 128-bit integer
+/// vector and clears the upper element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic does not correspond to a specific instruction.
+///
+/// \param __a
+/// A pointer to a 16-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si16(void const *__a)
+{
+ struct __loadu_si16 {
+ short __v;
+ } __attribute__((__packed__, __may_alias__));
+ short __u = ((const struct __loadu_si16*)__a)->__v;
+ return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
+}
+
+/// Loads a 64-bit double-precision value to the low element of a
+/// 128-bit vector of [2 x double] and clears the upper element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a memory location containing a double-precision value.
+/// The address of the memory location does not have to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the loaded value.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_load_sd(double const *__dp)
+{
+ struct __mm_load_sd_struct {
+ double __u;
+ } __attribute__((__packed__, __may_alias__));
+ double __u = ((const struct __mm_load_sd_struct*)__dp)->__u;
+ return __extension__ (__m128d){ __u, 0 };
+}
+
+/// Loads a double-precision value into the high-order bits of a 128-bit
+/// vector of [2 x double]. The low-order bits are copied from the low-order
+/// bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [63:0] are written to bits [63:0] of the result.
+/// \param __dp
+/// A pointer to a 64-bit memory location containing a double-precision
+/// floating-point value that is loaded. The loaded value is written to bits
+/// [127:64] of the result. The address of the memory location does not have
+/// to be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadh_pd(__m128d __a, double const *__dp)
+{
+ struct __mm_loadh_pd_struct {
+ double __u;
+ } __attribute__((__packed__, __may_alias__));
+ double __u = ((const struct __mm_loadh_pd_struct*)__dp)->__u;
+ return __extension__ (__m128d){ __a[0], __u };
+}
+
+/// Loads a double-precision value into the low-order bits of a 128-bit
+/// vector of [2 x double]. The high-order bits are copied from the
+/// high-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [127:64] are written to bits [127:64] of the result.
+/// \param __dp
+/// A pointer to a 64-bit memory location containing a double-precision
+/// floating-point value that is loaded. The loaded value is written to bits
+/// [63:0] of the result. The address of the memory location does not have to
+/// be aligned.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_loadl_pd(__m128d __a, double const *__dp)
+{
+ struct __mm_loadl_pd_struct {
+ double __u;
+ } __attribute__((__packed__, __may_alias__));
+ double __u = ((const struct __mm_loadl_pd_struct*)__dp)->__u;
+ return __extension__ (__m128d){ __u, __a[1] };
+}
+
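+/* Illustrative usage sketch (editorial example, assumes SSE2):
+ * _mm_loadl_pd() and _mm_loadh_pd() can be combined to gather two doubles
+ * from unrelated addresses into one vector.
+ *
+ *   double lo = 1.0, hi = 2.0;
+ *   __m128d v = _mm_setzero_pd();
+ *   v = _mm_loadl_pd(v, &lo);   // v = { 1.0, 0.0 }
+ *   v = _mm_loadh_pd(v, &hi);   // v = { 1.0, 2.0 }
+ */
+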
+/// Constructs a 128-bit floating-point vector of [2 x double] with
+/// unspecified content. This could be used as an argument to another
+/// intrinsic function where the argument is required but the value is not
+/// actually used.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
+/// content.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_undefined_pd(void)
+{
+ return (__m128d)__builtin_ia32_undef128();
+}
+
+/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
+/// 64 bits of the vector are initialized with the specified double-precision
+/// floating-point value. The upper 64 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize the lower 64
+/// bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
+/// lower 64 bits contain the value of the parameter. The upper 64 bits are
+/// set to zero.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_sd(double __w)
+{
+ return __extension__ (__m128d){ __w, 0 };
+}
+
+/// Constructs a 128-bit floating-point vector of [2 x double], with each
+/// of the two double-precision floating-point vector elements set to the
+/// specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize each vector
+/// element of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set1_pd(double __w)
+{
+ return __extension__ (__m128d){ __w, __w };
+}
+
+/// Constructs a 128-bit floating-point vector of [2 x double], with each
+/// of the two double-precision floating-point vector elements set to the
+/// specified double-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize each vector
+/// element of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_pd1(double __w)
+{
+ return _mm_set1_pd(__w);
+}
+
+/// Constructs a 128-bit floating-point vector of [2 x double]
+/// initialized with the specified double-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize the upper 64
+/// bits of the result.
+/// \param __x
+/// A double-precision floating-point value used to initialize the lower 64
+/// bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_set_pd(double __w, double __x)
+{
+ return __extension__ (__m128d){ __x, __w };
+}
+
+/// Constructs a 128-bit floating-point vector of [2 x double],
+/// initialized in reverse order with the specified double-precision
+/// floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __w
+/// A double-precision floating-point value used to initialize the lower 64
+/// bits of the result.
+/// \param __x
+/// A double-precision floating-point value used to initialize the upper 64
+/// bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [2 x double].
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_setr_pd(double __w, double __x)
+{
+ return __extension__ (__m128d){ __w, __x };
+}
+
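+/* Illustrative usage sketch (editorial example, assumes SSE2): _mm_set_pd()
+ * takes its arguments from the highest element down, while _mm_setr_pd()
+ * takes them in low-to-high (memory) order, so the two calls below build the
+ * same vector.
+ *
+ *   __m128d v1 = _mm_set_pd(2.0, 1.0);    // element 0 = 1.0, element 1 = 2.0
+ *   __m128d v2 = _mm_setr_pd(1.0, 2.0);   // same layout as v1
+ */
+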
+/// Constructs a 128-bit floating-point vector of [2 x double]
+/// initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit floating-point vector of [2 x double] with
+/// all elements set to zero.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_setzero_pd(void)
+{
+ return __extension__ (__m128d){ 0, 0 };
+}
+
+/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
+/// 64 bits are set to the lower 64 bits of the second parameter. The upper
+/// 64 bits are set to the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
+/// upper 64 bits of the result.
+/// \param __b
+/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
+/// lower 64 bits of the result.
+/// \returns A 128-bit vector of [2 x double] containing the moved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_move_sd(__m128d __a, __m128d __b)
+{
+ __a[0] = __b[0];
+ return __a;
+}
+
+/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 64-bit memory location.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_sd(double *__dp, __m128d __a)
+{
+ struct __mm_store_sd_struct {
+ double __u;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
+}
+
+/// Moves packed double-precision values from a 128-bit vector of
+/// [2 x double] to a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 16-byte aligned memory location that can store two
+/// double-precision values.
+/// \param __a
+/// A packed 128-bit vector of [2 x double] containing the values to be
+/// moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_pd(double *__dp, __m128d __a)
+{
+ *(__m128d*)__dp = __a;
+}
+
+/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to both
+/// the upper and lower 64 bits of a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the
+/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
+///
+/// \param __dp
+/// A pointer to a memory location that can store two double-precision
+/// values.
+/// \param __a
+/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
+/// of the values in \a __dp.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store1_pd(double *__dp, __m128d __a)
+{
+ __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
+ _mm_store_pd(__dp, __a);
+}
+
+/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to both
+/// the upper and lower 64 bits of a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the
+/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
+///
+/// \param __dp
+/// A pointer to a memory location that can store two double-precision
+/// values.
+/// \param __a
+/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
+/// of the values in \a __dp.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_pd1(double *__dp, __m128d __a)
+{
+ _mm_store1_pd(__dp, __a);
+}
+
+/// Stores a 128-bit vector of [2 x double] into an unaligned memory
+/// location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_pd(double *__dp, __m128d __a)
+{
+ struct __storeu_pd {
+ __m128d_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_pd*)__dp)->__v = __a;
+}
+
+/// Stores two double-precision values, in reverse order, from a 128-bit
+/// vector of [2 x double] to a 16-byte aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to a shuffling instruction followed by a
+/// <c> VMOVAPD / MOVAPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 16-byte aligned memory location that can store two
+/// double-precision values.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values to be reversed and
+/// stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storer_pd(double *__dp, __m128d __a)
+{
+ __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
+ *(__m128d *)__dp = __a;
+}
+
+/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 64-bit memory location.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeh_pd(double *__dp, __m128d __a)
+{
+ struct __mm_storeh_pd_struct {
+ double __u;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
+}
+
+/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
+///
+/// \param __dp
+/// A pointer to a 64-bit memory location.
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_pd(double *__dp, __m128d __a)
+{
+ struct __mm_storeh_pd_struct {
+ double __u;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
+}
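+
+/* Usage sketch for the store family above; the function and variable names
+   are placeholders, not declared by this header:
+
+     #include <emmintrin.h>
+
+     void store_examples(double *aligned16, double *anywhere)
+     {
+       __m128d v = _mm_set_pd(2.0, 1.0);  // lanes: low = 1.0, high = 2.0
+       _mm_store_pd(aligned16, v);        // aligned16 must be 16-byte aligned
+       _mm_storeu_pd(anywhere, v);        // no alignment requirement
+       _mm_storeh_pd(anywhere, v);        // writes only 2.0 (the upper lane)
+       _mm_storel_pd(anywhere + 1, v);    // writes only 1.0 (the lower lane)
+     }
+ */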
+
+/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
+/// saving the lower 8 bits of each sum in the corresponding element of a
+/// 128-bit result vector of [16 x i8].
+///
+/// The integer elements of both parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [16 x i8].
+/// \param __b
+/// A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the sums of both
+/// parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi8(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v16qu)__a + (__v16qu)__b);
+}
+
+/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
+/// saving the lower 16 bits of each sum in the corresponding element of a
+/// 128-bit result vector of [8 x i16].
+///
+/// The integer elements of both parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16].
+/// \param __b
+/// A 128-bit vector of [8 x i16].
+/// \returns A 128-bit vector of [8 x i16] containing the sums of both
+/// parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v8hu)__a + (__v8hu)__b);
+}
+
+/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
+/// saving the lower 32 bits of each sum in the corresponding element of a
+/// 128-bit result vector of [4 x i32].
+///
+/// The integer elements of both parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32].
+/// \param __b
+/// A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the sums of both
+/// parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v4su)__a + (__v4su)__b);
+}
+
+/// Adds two signed or unsigned 64-bit integer values, returning the
+/// lower 64 bits of the sum.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer.
+/// \param __b
+/// A 64-bit integer.
+/// \returns A 64-bit integer containing the sum of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_add_si64(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
+}
+
+/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
+/// saving the lower 64 bits of each sum in the corresponding element of a
+/// 128-bit result vector of [2 x i64].
+///
+/// The integer elements of both parameters can be either signed or unsigned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x i64].
+/// \param __b
+/// A 128-bit vector of [2 x i64].
+/// \returns A 128-bit vector of [2 x i64] containing the sums of both
+/// parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_add_epi64(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v2du)__a + (__v2du)__b);
+}
+
+/// Adds, with saturation, the corresponding elements of two 128-bit
+/// signed [16 x i8] vectors, saving each sum in the corresponding element of
+/// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
+/// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [16 x i8] vector.
+/// \param __b
+/// A 128-bit signed [16 x i8] vector.
+/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
+/// both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epi8(__m128i __a, __m128i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
+#else
+ return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
+#endif
+}
+
+/// Adds, with saturation, the corresponding elements of two 128-bit
+/// signed [8 x i16] vectors, saving each sum in the corresponding element of
+/// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
+/// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
+/// 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
+/// both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epi16(__m128i __a, __m128i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
+#else
+ return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
+#endif
+}
+
+/// Adds, with saturation, the corresponding elements of two 128-bit
+/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
+/// of a 128-bit result vector of [16 x i8]. Sums greater than 0xFF are
+/// saturated to 0xFF.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+/// A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
+/// of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epu8(__m128i __a, __m128i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
+#else
+ return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
+#endif
+}
+
+/// Adds, with saturation, the corresponding elements of two 128-bit
+/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
+/// of a 128-bit result vector of [8 x i16]. Sums greater than 0xFFFF are
+/// saturated to 0xFFFF.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+/// A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
+/// of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_adds_epu16(__m128i __a, __m128i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
+#else
+ return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
+#endif
+}
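+
+/* Usage sketch contrasting wrapping and saturating byte addition; the
+   function name is a placeholder, not declared by this header:
+
+     #include <emmintrin.h>
+
+     void saturation_demo(void)
+     {
+       __m128i a = _mm_set1_epi8(100), b = _mm_set1_epi8(50);
+       __m128i wrap = _mm_add_epi8(a, b);   // each byte wraps to -106 (0x96)
+       __m128i sat  = _mm_adds_epi8(a, b);  // each byte saturates to 127 (0x7F)
+       __m128i usat = _mm_adds_epu8(a, b);  // unsigned: 150 fits, so each byte is 0x96
+       (void)wrap; (void)sat; (void)usat;
+     }
+ */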
+
+/// Computes the rounded averages of corresponding elements of two
+/// 128-bit unsigned [16 x i8] vectors, saving each result in the
+/// corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+/// A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
+/// averages of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_avg_epu8(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// Computes the rounded averages of corresponding elements of two
+/// 128-bit unsigned [8 x i16] vectors, saving each result in the
+/// corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+/// A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
+/// averages of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_avg_epu16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
+/// vectors, producing eight intermediate 32-bit signed integer products, and
+/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
+/// [4 x i32] vector.
+///
+/// For example, bits [15:0] of both parameters are multiplied producing a
+/// 32-bit product, bits [31:16] of both parameters are multiplied producing
+/// a 32-bit product, and the sum of those two products becomes bits [31:0]
+/// of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
+/// of both parameters.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_madd_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
+}
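+
+/* Usage sketch: an eight-element 16-bit dot product built on _mm_madd_epi16,
+   assuming the partial and final sums fit in 32 bits; the function and
+   pointer names are placeholders, not declared by this header:
+
+     #include <emmintrin.h>
+
+     int dot8_i16(const short *x, const short *y)
+     {
+       __m128i a = _mm_loadu_si128((const __m128i *)x);
+       __m128i b = _mm_loadu_si128((const __m128i *)y);
+       __m128i s = _mm_madd_epi16(a, b);            // four 32-bit pair sums
+       s = _mm_add_epi32(s, _mm_srli_si128(s, 8));  // fold the upper half down
+       s = _mm_add_epi32(s, _mm_srli_si128(s, 4));
+       return _mm_cvtsi128_si32(s);                 // total in the lowest lane
+     }
+ */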
+
+/// Compares corresponding elements of two 128-bit signed [8 x i16]
+/// vectors, saving the greater value from each comparison in the
+/// corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
+/// each comparison.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi16(__m128i __a, __m128i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
+#else
+ return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
+#endif
+}
+
+/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
+/// vectors, saving the greater value from each comparison in the
+/// corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+/// A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
+/// each comparison.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu8(__m128i __a, __m128i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
+#else
+ return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
+#endif
+}
+
+/// Compares corresponding elements of two 128-bit signed [8 x i16]
+/// vectors, saving the smaller value from each comparison in the
+/// corresponding element of a 128-bit result vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
+/// each comparison.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi16(__m128i __a, __m128i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
+#else
+ return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
+#endif
+}
+
+/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
+/// vectors, saving the smaller value from each comparison in the
+/// corresponding element of a 128-bit result vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [16 x i8] vector.
+/// \param __b
+/// A 128-bit unsigned [16 x i8] vector.
+/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
+/// each comparison.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu8(__m128i __a, __m128i __b)
+{
+#if (__clang_major__ < 14)
+ return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
+#else
+ return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
+#endif
+}
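+
+/* Usage sketch: clamping eight signed 16-bit lanes into [lo, hi] with the
+   min/max intrinsics above; the helper name is a placeholder:
+
+     #include <emmintrin.h>
+
+     __m128i clamp_epi16(__m128i v, short lo, short hi)
+     {
+       v = _mm_max_epi16(v, _mm_set1_epi16(lo));     // raise lanes below lo
+       return _mm_min_epi16(v, _mm_set1_epi16(hi));  // lower lanes above hi
+     }
+ */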
+
+/// Multiplies the corresponding elements of two signed [8 x i16]
+/// vectors, saving the upper 16 bits of each 32-bit product in the
+/// corresponding element of a 128-bit signed [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
+/// each of the eight 32-bit products.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhi_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// Multiplies the corresponding elements of two unsigned [8 x i16]
+/// vectors, saving the upper 16 bits of each 32-bit product in the
+/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
+///
+/// \param __a
+/// A 128-bit unsigned [8 x i16] vector.
+/// \param __b
+/// A 128-bit unsigned [8 x i16] vector.
+/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
+/// of each of the eight 32-bit products.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhi_epu16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// Multiplies the corresponding elements of two signed [8 x i16]
+/// vectors, saving the lower 16 bits of each 32-bit product in the
+/// corresponding element of a 128-bit signed [8 x i16] result vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
+///
+/// \param __a
+/// A 128-bit signed [8 x i16] vector.
+/// \param __b
+/// A 128-bit signed [8 x i16] vector.
+/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
+/// each of the eight 32-bit products.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v8hu)__a * (__v8hu)__b);
+}
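+
+/* Usage sketch: combining _mm_mullo_epi16 and _mm_mulhi_epi16 into full
+   32-bit products for the low four lanes (_mm_unpacklo_epi16 is defined
+   elsewhere in this header); the helper name is a placeholder:
+
+     #include <emmintrin.h>
+
+     __m128i widen_mul_lo4(__m128i a, __m128i b)
+     {
+       __m128i lo = _mm_mullo_epi16(a, b);  // low 16 bits of each product
+       __m128i hi = _mm_mulhi_epi16(a, b);  // high 16 bits of each product
+       return _mm_unpacklo_epi16(lo, hi);   // interleave into [4 x i32] products
+     }
+ */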
+
+/// Multiplies 32-bit unsigned integer values contained in the lower bits
+/// of the two 64-bit integer vectors and returns the 64-bit unsigned
+/// product.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer containing one of the source operands.
+/// \param __b
+/// A 64-bit integer containing one of the source operands.
+/// \returns A 64-bit integer vector containing the product of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_mul_su32(__m64 __a, __m64 __b)
+{
+ return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
+}
+
+/// Multiplies 32-bit unsigned integer values contained in the lower
+/// bits of the corresponding elements of two [2 x i64] vectors, and returns
+/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
+///
+/// \param __a
+/// A [2 x i64] vector containing one of the source operands.
+/// \param __b
+/// A [2 x i64] vector containing one of the source operands.
+/// \returns A [2 x i64] vector containing the product of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mul_epu32(__m128i __a, __m128i __b)
+{
+ return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
+}
+
+/// Computes the absolute differences of corresponding 8-bit integer
+/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
+/// separately sums the second 8 absolute differences. Packs these two
+/// unsigned 16-bit integer sums into the upper and lower elements of a
+/// [2 x i64] vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing one of the source operands.
+/// \param __b
+/// A 128-bit integer vector containing one of the source operands.
+/// \returns A [2 x i64] vector containing the sums of the sets of absolute
+/// differences between both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sad_epu8(__m128i __a, __m128i __b)
+{
+ return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
+}
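+
+/* Usage sketch: the total absolute difference of two 16-byte blocks, a
+   common block-matching cost; the function and pointer names are
+   placeholders, not declared by this header:
+
+     #include <emmintrin.h>
+
+     unsigned sad16(const unsigned char *p, const unsigned char *q)
+     {
+       __m128i a = _mm_loadu_si128((const __m128i *)p);
+       __m128i b = _mm_loadu_si128((const __m128i *)q);
+       __m128i s = _mm_sad_epu8(a, b);  // partial sums in bits [15:0] and [79:64]
+       return (unsigned)_mm_cvtsi128_si32(s) +
+              (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(s, 8));
+     }
+ */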
+
+/// Subtracts the corresponding 8-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the minuends.
+/// \param __b
+/// A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+/// in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi8(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v16qu)__a - (__v16qu)__b);
+}
+
+/// Subtracts the corresponding 16-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the minuends.
+/// \param __b
+/// A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+/// in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v8hu)__a - (__v8hu)__b);
+}
+
+/// Subtracts the corresponding 32-bit integer values in the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the minuends.
+/// \param __b
+/// A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+/// in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v4su)__a - (__v4su)__b);
+}
+
+/// Subtracts signed or unsigned 64-bit integer values and writes the
+/// difference to the corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing the minuend.
+/// \param __b
+/// A 64-bit integer vector containing the subtrahend.
+/// \returns A 64-bit integer vector containing the difference of the values in
+/// the operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_sub_si64(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
+}
+
+/// Subtracts the corresponding elements of two [2 x i64] vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the minuends.
+/// \param __b
+/// A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+/// in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sub_epi64(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v2du)__a - (__v2du)__b);
+}
+
+/// Subtracts corresponding 8-bit signed integer values in the input and
+/// returns the differences in the corresponding bytes in the destination.
+/// Differences greater than 0x7F are saturated to 0x7F, and differences less
+/// than 0x80 are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the minuends.
+/// \param __b
+/// A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+/// in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epi8(__m128i __a, __m128i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
+#else
+ return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
+#endif
+}
+
+/// Subtracts corresponding 16-bit signed integer values in the input and
+/// returns the differences in the corresponding elements of the destination.
+/// Differences greater than 0x7FFF are saturated to 0x7FFF, and differences
+/// less than 0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the minuends.
+/// \param __b
+/// A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the differences of the values
+/// in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epi16(__m128i __a, __m128i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
+#else
+ return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
+#endif
+}
+
+/// Subtracts corresponding 8-bit unsigned integer values in the input
+/// and returns the differences in the corresponding bytes in the
+/// destination. Differences less than 0x00 are saturated to 0x00.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the minuends.
+/// \param __b
+/// A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the unsigned integer
+/// differences of the values in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epu8(__m128i __a, __m128i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
+#else
+ return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
+#endif
+}
+
+/// Subtracts corresponding 16-bit unsigned integer values in the input
+/// and returns the differences in the corresponding elements of the
+/// destination. Differences less than 0x0000 are saturated to 0x0000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the minuends.
+/// \param __b
+/// A 128-bit integer vector containing the subtrahends.
+/// \returns A 128-bit integer vector containing the unsigned integer
+/// differences of the values in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_subs_epu16(__m128i __a, __m128i __b)
+{
+#if (__clang_major__ > 14)
+ return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
+#else
+ return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
+#endif
+}
+
+/// Performs a bitwise AND of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing one of the source operands.
+/// \param __b
+/// A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise AND of the values
+/// in both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_and_si128(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v2du)__a & (__v2du)__b);
+}
+
+/// Performs a bitwise AND of two 128-bit integer vectors, using the
+/// one's complement of the values contained in the first source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector containing the left source operand. The one's complement
+/// of this value is used in the bitwise AND.
+/// \param __b
+/// A 128-bit vector containing the right source operand.
+/// \returns A 128-bit integer vector containing the bitwise AND of the one's
+/// complement of the first operand and the values in the second operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_andnot_si128(__m128i __a, __m128i __b)
+{
+ return (__m128i)(~(__v2du)__a & (__v2du)__b);
+}
+
+/// Performs a bitwise OR of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing one of the source operands.
+/// \param __b
+/// A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise OR of the values
+/// in both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_or_si128(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v2du)__a | (__v2du)__b);
+}
+
+/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing one of the source operands.
+/// \param __b
+/// A 128-bit integer vector containing one of the source operands.
+/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
+/// values in both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_xor_si128(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v2du)__a ^ (__v2du)__b);
+}
+
+/// Left-shifts the 128-bit integer vector operand by the specified
+/// number of bytes. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_slli_si128(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
+///
+/// \param a
+/// A 128-bit integer vector containing the source operand.
+/// \param imm
+/// An immediate value specifying the number of bytes to left-shift operand
+/// \a a.
+/// \returns A 128-bit integer vector containing the left-shifted value.
+#define _mm_slli_si128(a, imm) \
+ ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
+
+#define _mm_bslli_si128(a, imm) \
+ ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
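+
+/* Usage sketch: the count is in bytes and must be a compile-time constant;
+   counts above 15 produce a zero vector. The function name is a placeholder:
+
+     #include <emmintrin.h>
+
+     void byteshift_demo(void)
+     {
+       __m128i v = _mm_set_epi32(3, 2, 1, 0);  // 32-bit lanes, low to high: 0,1,2,3
+       __m128i w = _mm_slli_si128(v, 4);       // lanes become 0,0,1,2
+       (void)w;
+     }
+ */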
+
+/// Left-shifts each 16-bit value in the 128-bit integer vector operand
+/// by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// An integer value specifying the number of bits to left-shift each value
+/// in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi16(__m128i __a, int __count)
+{
+ return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
+}
+
+/// Left-shifts each 16-bit value in the 128-bit integer vector operand
+/// by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// A 128-bit integer vector in which bits [63:0] specify the number of bits
+/// to left-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi16(__m128i __a, __m128i __count)
+{
+ return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
+}
+
+/// Left-shifts each 32-bit value in the 128-bit integer vector operand
+/// by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// An integer value specifying the number of bits to left-shift each value
+/// in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi32(__m128i __a, int __count)
+{
+ return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
+}
+
+/// Left-shifts each 32-bit value in the 128-bit integer vector operand
+/// by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// A 128-bit integer vector in which bits [63:0] specify the number of bits
+/// to left-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi32(__m128i __a, __m128i __count)
+{
+ return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
+}
+
+/// Left-shifts each 64-bit value in the 128-bit integer vector operand
+/// by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// An integer value specifying the number of bits to left-shift each value
+/// in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_slli_epi64(__m128i __a, int __count)
+{
+ return __builtin_ia32_psllqi128((__v2di)__a, __count);
+}
+
+/// Left-shifts each 64-bit value in the 128-bit integer vector operand
+/// by the specified number of bits. Low-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// A 128-bit integer vector in which bits [63:0] specify the number of bits
+/// to left-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the left-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sll_epi64(__m128i __a, __m128i __count)
+{
+ return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
+}
+
+/// Right-shifts each 16-bit value in the 128-bit integer vector operand
+/// by the specified number of bits. High-order bits are filled with the sign
+/// bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// An integer value specifying the number of bits to right-shift each value
+/// in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi16(__m128i __a, int __count)
+{
+ return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
+}
+
+/// Right-shifts each 16-bit value in the 128-bit integer vector operand
+/// by the specified number of bits. High-order bits are filled with the sign
+/// bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// A 128-bit integer vector in which bits [63:0] specify the number of bits
+/// to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi16(__m128i __a, __m128i __count)
+{
+ return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
+}
+
+/// Right-shifts each 32-bit value in the 128-bit integer vector operand
+/// by the specified number of bits. High-order bits are filled with the sign
+/// bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// An integer value specifying the number of bits to right-shift each value
+/// in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srai_epi32(__m128i __a, int __count)
+{
+ return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
+}
+
+/// Right-shifts each 32-bit value in the 128-bit integer vector operand
+/// by the specified number of bits. High-order bits are filled with the sign
+/// bit of the initial value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// A 128-bit integer vector in which bits [63:0] specify the number of bits
+/// to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sra_epi32(__m128i __a, __m128i __count)
+{
+ return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
+}
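+
+/* Usage sketch contrasting the arithmetic shifts above, which replicate the
+   sign bit, with the logical shifts defined further below, which shift in
+   zeros; the function name is a placeholder:
+
+     #include <emmintrin.h>
+
+     void shift_demo(void)
+     {
+       __m128i a  = _mm_set1_epi32(-8);    // 0xFFFFFFF8 in every lane
+       __m128i ar = _mm_srai_epi32(a, 2);  // -2 (0xFFFFFFFE) in every lane
+       __m128i lr = _mm_srli_epi32(a, 2);  // 0x3FFFFFFE in every lane
+       (void)ar; (void)lr;
+     }
+ */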
+
+/// Right-shifts the 128-bit integer vector operand by the specified
+/// number of bytes. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_srli_si128(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
+///
+/// \param a
+/// A 128-bit integer vector containing the source operand.
+/// \param imm
+/// An immediate value specifying the number of bytes to right-shift operand
+/// \a a.
+/// \returns A 128-bit integer vector containing the right-shifted value.
+#define _mm_srli_si128(a, imm) \
+ ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
+
+#define _mm_bsrli_si128(a, imm) \
+ ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
+
+/// Right-shifts each of the 16-bit values in the 128-bit integer vector
+/// operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// An integer value specifying the number of bits to right-shift each value
+/// in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi16(__m128i __a, int __count)
+{
+ return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
+}
+
+/// Right-shifts each of the 16-bit values in the 128-bit integer vector
+/// operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// A 128-bit integer vector in which bits [63:0] specify the number of bits
+/// to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi16(__m128i __a, __m128i __count)
+{
+ return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
+}
+
+/// Right-shifts each of the 32-bit values in the 128-bit integer vector
+/// operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// An integer value specifying the number of bits to right-shift each value
+/// in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi32(__m128i __a, int __count)
+{
+ return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
+}
+
+/// Right-shifts each of the 32-bit values in the 128-bit integer vector
+/// operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// A 128-bit integer vector in which bits [63:0] specify the number of bits
+/// to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi32(__m128i __a, __m128i __count)
+{
+ return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
+}
+
+/// Right-shifts each of the 64-bit values in the 128-bit integer vector
+/// operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// An integer value specifying the number of bits to right-shift each value
+/// in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srli_epi64(__m128i __a, int __count)
+{
+ return __builtin_ia32_psrlqi128((__v2di)__a, __count);
+}
+
+/// Right-shifts each of the 64-bit values in the 128-bit integer vector
+/// operand by the specified number of bits. High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the source operand.
+/// \param __count
+/// A 128-bit integer vector in which bits [63:0] specify the number of bits
+/// to right-shift each value in operand \a __a.
+/// \returns A 128-bit integer vector containing the right-shifted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_srl_epi64(__m128i __a, __m128i __count)
+{
+ return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
+}
+
+/// Compares each of the corresponding 8-bit values of the 128-bit
+/// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
+/// for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \param __b
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi8(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v16qi)__a == (__v16qi)__b);
+}
+
+/// Compares each of the corresponding 16-bit values of the 128-bit
+/// integer vectors for equality. Each comparison yields 0x0 for false,
+/// 0xFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \param __b
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v8hi)__a == (__v8hi)__b);
+}
+
+/// Compares each of the corresponding 32-bit values of the 128-bit
+/// integer vectors for equality. Each comparison yields 0x0 for false,
+/// 0xFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \param __b
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v4si)__a == (__v4si)__b);
+}
+
+/// Compares each of the corresponding signed 8-bit values of the 128-bit
+/// integer vectors to determine if the values in the first operand are
+/// greater than those in the second operand. Each comparison yields 0x0 for
+/// false, 0xFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \param __b
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi8(__m128i __a, __m128i __b)
+{
+ /* This function always performs a signed comparison, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m128i)((__v16qs)__a > (__v16qs)__b);
+}
+
+/// Compares each of the corresponding signed 16-bit values of the
+/// 128-bit integer vectors to determine if the values in the first operand
+/// are greater than those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \param __b
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v8hi)__a > (__v8hi)__b);
+}
+
+/// Compares each of the corresponding signed 32-bit values of the
+/// 128-bit integer vectors to determine if the values in the first operand
+/// are greater than those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \param __b
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)((__v4si)__a > (__v4si)__b);
+}
+
+/// Compares each of the corresponding signed 8-bit values of the 128-bit
+/// integer vectors to determine if the values in the first operand are less
+/// than those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \param __b
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi8(__m128i __a, __m128i __b)
+{
+ return _mm_cmpgt_epi8(__b, __a);
+}
+
+/// Compares each of the corresponding signed 16-bit values of the
+/// 128-bit integer vectors to determine if the values in the first operand
+/// are less than those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \param __b
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi16(__m128i __a, __m128i __b)
+{
+ return _mm_cmpgt_epi16(__b, __a);
+}
+
+/// Compares each of the corresponding signed 32-bit values of the
+/// 128-bit integer vectors to determine if the values in the first operand
+/// are less than those in the second operand.
+///
+/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \param __b
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmplt_epi32(__m128i __a, __m128i __b)
+{
+ return _mm_cmpgt_epi32(__b, __a);
+}
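+
+/* Usage sketch: SSE2 has no packed signed 32-bit maximum intrinsic, but one
+   can be composed from the comparison and bitwise operations above; the
+   helper name is a placeholder:
+
+     #include <emmintrin.h>
+
+     __m128i max_epi32_sse2(__m128i a, __m128i b)
+     {
+       __m128i gt = _mm_cmpgt_epi32(a, b);            // 0xFFFFFFFF where a > b
+       return _mm_or_si128(_mm_and_si128(gt, a),      // keep a where a > b
+                           _mm_andnot_si128(gt, b));  // keep b elsewhere
+     }
+ */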
+
+#ifdef __x86_64__
+/// Converts a 64-bit signed integer value from the second operand into a
+/// double-precision value and returns it in the lower element of a [2 x
+/// double] vector; the upper element of the returned vector is copied from
+/// the upper element of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
+/// copied to the upper 64 bits of the destination.
+/// \param __b
+/// A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
+/// converted value of the second operand. The upper 64 bits are copied from
+/// the upper 64 bits of the first operand.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_cvtsi64_sd(__m128d __a, long long __b)
+{
+ __a[0] = __b;
+ return __a;
+}
+
+/// Converts the first (lower) element of a vector of [2 x double] into a
+/// 64-bit signed integer value, according to the current rounding mode.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+/// conversion.
+/// \returns A 64-bit signed integer containing the converted value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtsd_si64(__m128d __a)
+{
+ return __builtin_ia32_cvtsd2si64((__v2df)__a);
+}
+
+/// Converts the first (lower) element of a vector of [2 x double] into a
+/// 64-bit signed integer value, truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
+/// conversion.
+/// \returns A 64-bit signed integer containing the converted value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttsd_si64(__m128d __a)
+{
+ return __builtin_ia32_cvttsd2si64((__v2df)__a);
+}
+#endif
+
+/// Converts a vector of [4 x i32] into a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 128-bit vector of [4 x float] containing the converted values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtepi32_ps(__m128i __a)
+{
+ return (__m128)__builtin_convertvector((__v4si)__a, __v4sf);
+}
+
+/// Converts a vector of [4 x float] into a vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit integer vector of [4 x i32] containing the converted
+/// values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtps_epi32(__m128 __a)
+{
+ return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
+}
+
+/// Converts a vector of [4 x float] into a vector of [4 x i32],
+/// truncating the result when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x i32] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvttps_epi32(__m128 __a)
+{
+ return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
+}
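+
+/* Usage sketch: with the default MXCSR rounding mode (round to nearest,
+   ties to even) the two conversions above differ; the function name is a
+   placeholder:
+
+     #include <emmintrin.h>
+
+     void convert_demo(void)
+     {
+       __m128 f  = _mm_set1_ps(1.5f);
+       __m128i r = _mm_cvtps_epi32(f);   // each lane = 2 (rounded to nearest even)
+       __m128i t = _mm_cvttps_epi32(f);  // each lane = 1 (truncated toward zero)
+       (void)r; (void)t;
+     }
+ */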
+
+/// Returns a vector of [4 x i32] where the lowest element is the input
+/// operand and the remaining elements are zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __a
+/// A 32-bit signed integer operand.
+/// \returns A 128-bit vector of [4 x i32].
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsi32_si128(int __a)
+{
+ return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 };
+}
+
+#ifdef __x86_64__
+/// Returns a vector of [2 x i64] where the lower element is the input
+/// operand and the upper element is zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+/// A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [2 x i64] containing the converted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtsi64_si128(long long __a)
+{
+ return __extension__ (__m128i)(__v2di){ __a, 0 };
+}
+#endif
+
+/// Moves the least significant 32 bits of a vector of [4 x i32] to a
+/// 32-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __a
+/// A vector of [4 x i32]. The least significant 32 bits are moved to the
+/// destination.
+/// \returns A 32-bit signed integer containing the moved value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsi128_si32(__m128i __a)
+{
+ __v4si __b = (__v4si)__a;
+ return __b[0];
+}
+
+#ifdef __x86_64__
+/// Moves the least significant 64 bits of a vector of [2 x i64] to a
+/// 64-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+/// A vector of [2 x i64]. The least significant 64 bits are moved to the
+/// destination.
+/// \returns A 64-bit signed integer containing the moved value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtsi128_si64(__m128i __a)
+{
+ return __a[0];
+}
+#endif
+
+/// Moves packed integer values from an aligned 128-bit memory location
+/// to elements in a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
+///
+/// \param __p
+/// An aligned pointer to a memory location containing integer values.
+/// \returns A 128-bit integer vector containing the moved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_load_si128(__m128i const *__p)
+{
+ return *__p;
+}
+
+/// Moves packed integer values from an unaligned 128-bit memory location
+/// to elements in a 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location containing integer values.
+/// \returns A 128-bit integer vector containing the moved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si128(__m128i_u const *__p)
+{
+ struct __loadu_si128 {
+ __m128i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_si128*)__p)->__v;
+}
+
+/// Returns a vector of [2 x i64] where the lower element is taken from
+/// the lower element of the operand, and the upper element is zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __p
+/// A pointer to a 64-bit memory location. The 64-bit value at this location
+/// is copied to bits [63:0] of the destination; the address does not have to
+/// be aligned.
+/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
+/// moved value. The higher order bits are cleared.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadl_epi64(__m128i_u const *__p)
+{
+ struct __mm_loadl_epi64_struct {
+ long long __u;
+ } __attribute__((__packed__, __may_alias__));
+ return __extension__ (__m128i) { ((const struct __mm_loadl_epi64_struct*)__p)->__u, 0};
+}
+
+/// Generates a 128-bit vector of [4 x i32] with unspecified content.
+/// This could be used as an argument to another intrinsic function where the
+/// argument is required but the value is not actually used.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit vector of [4 x i32] with unspecified content.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_undefined_si128(void)
+{
+ return (__m128i)__builtin_ia32_undef128();
+}
+
+/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+/// the specified 64-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __q1
+/// A 64-bit integer value used to initialize the upper 64 bits of the
+/// destination vector of [2 x i64].
+/// \param __q0
+/// A 64-bit integer value used to initialize the lower 64 bits of the
+/// destination vector of [2 x i64].
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+/// provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi64x(long long __q1, long long __q0)
+{
+ return __extension__ (__m128i)(__v2di){ __q0, __q1 };
+}
+
+/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
+/// the specified 64-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __q1
+/// A 64-bit integer value used to initialize the upper 64 bits of the
+/// destination vector of [2 x i64].
+/// \param __q0
+/// A 64-bit integer value used to initialize the lower 64 bits of the
+/// destination vector of [2 x i64].
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values
+/// provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi64(__m64 __q1, __m64 __q0)
+{
+ return _mm_set_epi64x((long long)__q1, (long long)__q0);
+}
+
+/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
+/// the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i3
+/// A 32-bit integer value used to initialize bits [127:96] of the
+/// destination vector.
+/// \param __i2
+/// A 32-bit integer value used to initialize bits [95:64] of the destination
+/// vector.
+/// \param __i1
+/// A 32-bit integer value used to initialize bits [63:32] of the destination
+/// vector.
+/// \param __i0
+/// A 32-bit integer value used to initialize bits [31:0] of the destination
+/// vector.
+/// \returns An initialized 128-bit vector of [4 x i32] containing the values
+/// provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
+{
+ return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
+}
+
+/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
+/// the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w7
+/// A 16-bit integer value used to initialize bits [127:112] of the
+/// destination vector.
+/// \param __w6
+/// A 16-bit integer value used to initialize bits [111:96] of the
+/// destination vector.
+/// \param __w5
+/// A 16-bit integer value used to initialize bits [95:80] of the destination
+/// vector.
+/// \param __w4
+/// A 16-bit integer value used to initialize bits [79:64] of the destination
+/// vector.
+/// \param __w3
+/// A 16-bit integer value used to initialize bits [63:48] of the destination
+/// vector.
+/// \param __w2
+/// A 16-bit integer value used to initialize bits [47:32] of the destination
+/// vector.
+/// \param __w1
+/// A 16-bit integer value used to initialize bits [31:16] of the destination
+/// vector.
+/// \param __w0
+/// A 16-bit integer value used to initialize bits [15:0] of the destination
+/// vector.
+/// \returns An initialized 128-bit vector of [8 x i16] containing the values
+/// provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
+{
+ return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
+}
+
+/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
+/// the specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b15
+/// Initializes bits [127:120] of the destination vector.
+/// \param __b14
+/// Initializes bits [119:112] of the destination vector.
+/// \param __b13
+/// Initializes bits [111:104] of the destination vector.
+/// \param __b12
+/// Initializes bits [103:96] of the destination vector.
+/// \param __b11
+/// Initializes bits [95:88] of the destination vector.
+/// \param __b10
+/// Initializes bits [87:80] of the destination vector.
+/// \param __b9
+/// Initializes bits [79:72] of the destination vector.
+/// \param __b8
+/// Initializes bits [71:64] of the destination vector.
+/// \param __b7
+/// Initializes bits [63:56] of the destination vector.
+/// \param __b6
+/// Initializes bits [55:48] of the destination vector.
+/// \param __b5
+/// Initializes bits [47:40] of the destination vector.
+/// \param __b4
+/// Initializes bits [39:32] of the destination vector.
+/// \param __b3
+/// Initializes bits [31:24] of the destination vector.
+/// \param __b2
+/// Initializes bits [23:16] of the destination vector.
+/// \param __b1
+/// Initializes bits [15:8] of the destination vector.
+/// \param __b0
+/// Initializes bits [7:0] of the destination vector.
+/// \returns An initialized 128-bit vector of [16 x i8] containing the values
+/// provided in the operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
+{
+ return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
+}
+
+/// Initializes both values in a 128-bit integer vector with the
+/// specified 64-bit integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __q
+/// Integer value used to initialize the elements of the destination integer
+/// vector.
+/// \returns An initialized 128-bit integer vector of [2 x i64] with both
+/// elements containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi64x(long long __q)
+{
+ return _mm_set_epi64x(__q, __q);
+}
+
+/// Initializes both values in a 128-bit vector of [2 x i64] with the
+/// specified 64-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __q
+/// A 64-bit value used to initialize the elements of the destination integer
+/// vector.
+/// \returns An initialized 128-bit vector of [2 x i64] with all elements
+/// containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi64(__m64 __q)
+{
+ return _mm_set_epi64(__q, __q);
+}
+
+/// Initializes all values in a 128-bit vector of [4 x i32] with the
+/// specified 32-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i
+/// A 32-bit value used to initialize the elements of the destination integer
+/// vector.
+/// \returns An initialized 128-bit vector of [4 x i32] with all elements
+/// containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi32(int __i)
+{
+ return _mm_set_epi32(__i, __i, __i, __i);
+}
+
+/// Initializes all values in a 128-bit vector of [8 x i16] with the
+/// specified 16-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w
+/// A 16-bit value used to initialize the elements of the destination integer
+/// vector.
+/// \returns An initialized 128-bit vector of [8 x i16] with all elements
+/// containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi16(short __w)
+{
+ return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
+}
+
+/// Initializes all values in a 128-bit vector of [16 x i8] with the
+/// specified 8-bit value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b
+/// An 8-bit value used to initialize the elements of the destination integer
+/// vector.
+/// \returns An initialized 128-bit vector of [16 x i8] with all elements
+/// containing the value provided in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_set1_epi8(char __b)
+{
+ return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b);
+}
+
+/// Constructs a 128-bit integer vector, initialized in reverse order
+/// with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic does not correspond to a specific instruction.
+///
+/// \param __q0
+/// A 64-bit integral value used to initialize the lower 64 bits of the
+/// result.
+/// \param __q1
+/// A 64-bit integral value used to initialize the upper 64 bits of the
+/// result.
+/// \returns An initialized 128-bit integer vector.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi64(__m64 __q0, __m64 __q1)
+{
+ return _mm_set_epi64(__q1, __q0);
+}
+
+/// Constructs a 128-bit integer vector, initialized in reverse order
+/// with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i0
+/// A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+/// A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+/// A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+/// A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \returns An initialized 128-bit integer vector.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
+{
+ return _mm_set_epi32(__i3, __i2, __i1, __i0);
+}
+
+/// Constructs a 128-bit integer vector, initialized in reverse order
+/// with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w0
+/// A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w1
+/// A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w2
+/// A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w3
+/// A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w4
+/// A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w5
+/// A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w6
+/// A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w7
+/// A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \returns An initialized 128-bit integer vector.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
+{
+ return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
+}
+
+/// Constructs a 128-bit integer vector, initialized in reverse order
+/// with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b0
+/// An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b1
+/// An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b2
+/// An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b3
+/// An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b4
+/// An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b5
+/// An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b6
+/// An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b7
+/// An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b8
+/// An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b9
+/// An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b10
+/// An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b11
+/// An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b12
+/// An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b13
+/// An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b14
+/// An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b15
+/// An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \returns An initialized 128-bit integer vector.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
+{
+ return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+/// Creates a 128-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit integer vector with all elements set to
+/// zero.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_setzero_si128(void)
+{
+ return __extension__ (__m128i)(__v2di){ 0LL, 0LL };
+}
+
+/// Stores a 128-bit integer vector to a memory location aligned on a
+/// 128-bit boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
+///
+/// \param __p
+/// A pointer to an aligned memory location that will receive the integer
+/// values.
+/// \param __b
+/// A 128-bit integer vector containing the values to be moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_si128(__m128i *__p, __m128i __b)
+{
+ *__p = __b;
+}
+
+/// Stores a 128-bit integer vector to an unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the integer values.
+/// \param __b
+/// A 128-bit integer vector containing the values to be moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si128(__m128i_u *__p, __m128i __b)
+{
+ struct __storeu_si128 {
+ __m128i_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_si128*)__p)->__v = __b;
+}
+
+/// Stores a 64-bit integer value from the low element of a 128-bit integer
+/// vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __p
+/// A pointer to a 64-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \param __b
+/// A 128-bit integer vector containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si64(void *__p, __m128i __b)
+{
+ struct __storeu_si64 {
+ long long __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0];
+}
+
+/// Stores a 32-bit integer value from the low element of a 128-bit integer
+/// vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __p
+/// A pointer to a 32-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \param __b
+/// A 128-bit integer vector containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si32(void *__p, __m128i __b)
+{
+ struct __storeu_si32 {
+ int __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0];
+}
+
+/// Stores a 16-bit integer value from the low element of a 128-bit integer
+/// vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic does not correspond to a specific instruction.
+///
+/// \param __p
+/// A pointer to a 16-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \param __b
+/// A 128-bit integer vector containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si16(void *__p, __m128i __b)
+{
+ struct __storeu_si16 {
+ short __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0];
+}
+
+/// Moves bytes selected by the mask from the first operand to the
+/// specified unaligned memory location. When a mask bit is 1, the
+/// corresponding byte is written, otherwise it is not written.
+///
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon). Exception and trap behavior for elements not selected
+/// for storage to memory are implementation dependent.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
+/// instruction.
+///
+/// \param __d
+/// A 128-bit integer vector containing the values to be moved.
+/// \param __n
+/// A 128-bit integer vector containing the mask. The most significant bit of
+/// each byte represents the mask bits.
+/// \param __p
+/// A pointer to an unaligned 128-bit memory location where the specified
+/// values are moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
+{
+ __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
+}
+
+/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
+/// a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 64-bit memory location that will receive the lower 64 bits
+/// of the integer vector parameter.
+/// \param __a
+/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
+/// value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_epi64(__m128i_u *__p, __m128i __a)
+{
+ struct __mm_storel_epi64_struct {
+ long long __u;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
+}
+
+/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
+/// aligned memory location.
+///
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPD / MOVNTPD </c> instruction.
+///
+/// \param __p
+/// A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+/// A vector of [2 x double] containing the 64-bit values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_pd(double *__p, __m128d __a)
+{
+ __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
+}
+
+/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
+///
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTDQ / MOVNTDQ </c> instruction.
+///
+/// \param __p
+/// A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+/// A 128-bit integer vector containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_si128(__m128i *__p, __m128i __a)
+{
+ __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
+}
+
+/// Stores a 32-bit integer value in the specified memory location.
+///
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
+///
+/// \param __p
+/// A pointer to the 32-bit memory location used to store the value.
+/// \param __a
+/// A 32-bit integer containing the value to be stored.
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
+_mm_stream_si32(int *__p, int __a)
+{
+ __builtin_ia32_movnti(__p, __a);
+}
+
+#ifdef __x86_64__
+/// Stores a 64-bit integer value in the specified memory location.
+///
+/// To minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
+///
+/// \param __p
+/// A pointer to the 64-bit memory location used to store the value.
+/// \param __a
+/// A 64-bit integer containing the value to be stored.
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
+_mm_stream_si64(long long *__p, long long __a)
+{
+ __builtin_ia32_movnti64(__p, __a);
+}
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// The cache line containing \a __p is flushed and invalidated from all
+/// caches in the coherency domain.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
+///
+/// \param __p
+/// A pointer to the memory location used to identify the cache line to be
+/// flushed.
+void _mm_clflush(void const * __p);
+
+/// Forces strong memory ordering (serialization) between load
+/// instructions preceding this instruction and load instructions following
+/// this instruction, ensuring the system completes all previous loads before
+/// executing subsequent loads.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
+///
+void _mm_lfence(void);
+
+/// Forces strong memory ordering (serialization) between load and store
+/// instructions preceding this instruction and load and store instructions
+/// following this instruction, ensuring that the system completes all
+/// previous memory accesses before executing subsequent memory accesses.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
+///
+void _mm_mfence(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+/// Converts 16-bit signed integers from both 128-bit integer vector
+/// operands into 8-bit signed integers, and packs the results into the
+/// destination. Positive values greater than 0x7F are saturated to 0x7F.
+/// Negative values less than 0x80 are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+/// a signed integer and is converted to an 8-bit signed integer with
+/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+/// written to the lower 64 bits of the result.
+/// \param __b
+/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+/// a signed integer and is converted to an 8-bit signed integer with
+/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+/// written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packs_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// Converts 32-bit signed integers from both 128-bit integer vector
+/// operands into 16-bit signed integers, and packs the results into the
+/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+/// Negative values less than 0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+/// a signed integer and is converted to a 16-bit signed integer with
+/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+/// are written to the lower 64 bits of the result.
+/// \param __b
+/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+/// a signed integer and is converted to a 16-bit signed integer with
+/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+/// are written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packs_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
+}
+
+/// Converts 16-bit signed integers from both 128-bit integer vector
+/// operands into 8-bit unsigned integers, and packs the results into the
+/// destination. Values greater than 0xFF are saturated to 0xFF. Values less
+/// than 0x00 are saturated to 0x00.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+/// a signed integer and is converted to an 8-bit unsigned integer with
+/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+/// written to the lower 64 bits of the result.
+/// \param __b
+/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+/// a signed integer and is converted to an 8-bit unsigned integer with
+/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+/// written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packus_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
+/// the immediate-value parameter as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \param __imm
+/// An immediate value. Bits [2:0] select values from \a __a to be assigned
+/// to bits[15:0] of the result. \n
+/// 000: assign values from bits [15:0] of \a __a. \n
+/// 001: assign values from bits [31:16] of \a __a. \n
+/// 010: assign values from bits [47:32] of \a __a. \n
+/// 011: assign values from bits [63:48] of \a __a. \n
+/// 100: assign values from bits [79:64] of \a __a. \n
+/// 101: assign values from bits [95:80] of \a __a. \n
+/// 110: assign values from bits [111:96] of \a __a. \n
+/// 111: assign values from bits [127:112] of \a __a.
+/// \returns An integer, whose lower 16 bits are selected from the 128-bit
+/// integer vector parameter and the remaining bits are assigned zeros.
+#define _mm_extract_epi16(a, imm) \
+ ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
+ (int)(imm)))
+
+/// Constructs a 128-bit integer vector by first making a copy of the
+/// 128-bit integer vector parameter, and then inserting the lower 16 bits
+/// of an integer parameter into an offset specified by the immediate-value
+/// parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector of [8 x i16]. This vector is copied to the
+/// result and then one of the eight elements in the result is replaced by
+/// the lower 16 bits of \a __b.
+/// \param __b
+/// An integer. The lower 16 bits of this parameter are written to the
+/// result beginning at an offset specified by \a __imm.
+/// \param __imm
+/// An immediate value specifying the bit offset in the result at which the
+/// lower 16 bits of \a __b are written.
+/// \returns A 128-bit integer vector containing the constructed values.
+#define _mm_insert_epi16(a, b, imm) \
+ ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
+ (int)(imm)))
+
+/// Copies the values of the most significant bits from each 8-bit
+/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
+/// value, zero-extends the value, and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the values with bits to be extracted.
+/// \returns The most significant bits from each 8-bit element in \a __a,
+/// written to bits [15:0]. The other bits are assigned zeros.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_epi8(__m128i __a)
+{
+ return __builtin_ia32_pmovmskb128((__v16qi)__a);
+}
+
+/// Constructs a 128-bit integer vector by shuffling four 32-bit
+/// elements of a 128-bit integer vector parameter, using the immediate-value
+/// parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
+///
+/// \param a
+/// A 128-bit integer vector containing the values to be copied.
+/// \param imm
+/// An immediate value containing an 8-bit value specifying which elements to
+/// copy from \a a. The destination elements within the 128-bit result are
+/// assigned values as follows: \n
+/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
+/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
+/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
+/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
+/// Bit value assignments: \n
+/// 00: assign values from bits [31:0] of \a a. \n
+/// 01: assign values from bits [63:32] of \a a. \n
+/// 10: assign values from bits [95:64] of \a a. \n
+/// 11: assign values from bits [127:96] of \a a.
+/// \returns A 128-bit integer vector containing the shuffled values.
+#define _mm_shuffle_epi32(a, imm) \
+ ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
+
+/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
+/// elements of a 128-bit integer vector of [8 x i16], using the immediate
+/// value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
+///
+/// \param a
+/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
+/// [127:64] of the result.
+/// \param imm
+/// An 8-bit immediate value specifying which elements to copy from \a a. \n
+/// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
+/// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
+/// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
+/// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
+/// Bit value assignments: \n
+/// 00: assign values from bits [15:0] of \a a. \n
+/// 01: assign values from bits [31:16] of \a a. \n
+/// 10: assign values from bits [47:32] of \a a. \n
+/// 11: assign values from bits [63:48] of \a a. \n
+/// \returns A 128-bit integer vector containing the shuffled values.
+#define _mm_shufflelo_epi16(a, imm) \
+ ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
+
+/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
+/// elements of a 128-bit integer vector of [8 x i16], using the immediate
+/// value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
+///
+/// \param a
+/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
+/// [63:0] of the result.
+/// \param imm
+/// An 8-bit immediate value specifying which elements to copy from \a a. \n
+/// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
+/// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
+/// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
+/// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
+/// Bit value assignments: \n
+/// 00: assign values from bits [79:64] of \a a. \n
+/// 01: assign values from bits [95:80] of \a a. \n
+/// 10: assign values from bits [111:96] of \a a. \n
+/// 11: assign values from bits [127:112] of \a a. \n
+/// \returns A 128-bit integer vector containing the shuffled values.
+#define _mm_shufflehi_epi16(a, imm) \
+ ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
+
+/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
+/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [16 x i8].
+/// Bits [71:64] are written to bits [7:0] of the result. \n
+/// Bits [79:72] are written to bits [23:16] of the result. \n
+/// Bits [87:80] are written to bits [39:32] of the result. \n
+/// Bits [95:88] are written to bits [55:48] of the result. \n
+/// Bits [103:96] are written to bits [71:64] of the result. \n
+/// Bits [111:104] are written to bits [87:80] of the result. \n
+/// Bits [119:112] are written to bits [103:96] of the result. \n
+/// Bits [127:120] are written to bits [119:112] of the result.
+/// \param __b
+/// A 128-bit vector of [16 x i8]. \n
+/// Bits [71:64] are written to bits [15:8] of the result. \n
+/// Bits [79:72] are written to bits [31:24] of the result. \n
+/// Bits [87:80] are written to bits [47:40] of the result. \n
+/// Bits [95:88] are written to bits [63:56] of the result. \n
+/// Bits [103:96] are written to bits [79:72] of the result. \n
+/// Bits [111:104] are written to bits [95:88] of the result. \n
+/// Bits [119:112] are written to bits [111:104] of the result. \n
+/// Bits [127:120] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi8(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+}
+
+/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
+/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16].
+/// Bits [79:64] are written to bits [15:0] of the result. \n
+/// Bits [95:80] are written to bits [47:32] of the result. \n
+/// Bits [111:96] are written to bits [79:64] of the result. \n
+/// Bits [127:112] are written to bits [111:96] of the result.
+/// \param __b
+/// A 128-bit vector of [8 x i16].
+/// Bits [79:64] are written to bits [31:16] of the result. \n
+/// Bits [95:80] are written to bits [63:48] of the result. \n
+/// Bits [111:96] are written to bits [95:80] of the result. \n
+/// Bits [127:112] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
+}
+
+/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32]. \n
+/// Bits [95:64] are written to bits [31:0] of the destination. \n
+/// Bits [127:96] are written to bits [95:64] of the destination.
+/// \param __b
+/// A 128-bit vector of [4 x i32]. \n
+/// Bits [95:64] are written to bits [63:32] of the destination. \n
+/// Bits [127:96] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
+}
+
+/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
+/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x i64]. \n
+/// Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+/// A 128-bit vector of [2 x i64]. \n
+/// Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpackhi_epi64(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
+}
+
+/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
+/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [16 x i8]. \n
+/// Bits [7:0] are written to bits [7:0] of the result. \n
+/// Bits [15:8] are written to bits [23:16] of the result. \n
+/// Bits [23:16] are written to bits [39:32] of the result. \n
+/// Bits [31:24] are written to bits [55:48] of the result. \n
+/// Bits [39:32] are written to bits [71:64] of the result. \n
+/// Bits [47:40] are written to bits [87:80] of the result. \n
+/// Bits [55:48] are written to bits [103:96] of the result. \n
+/// Bits [63:56] are written to bits [119:112] of the result.
+/// \param __b
+/// A 128-bit vector of [16 x i8].
+/// Bits [7:0] are written to bits [15:8] of the result. \n
+/// Bits [15:8] are written to bits [31:24] of the result. \n
+/// Bits [23:16] are written to bits [47:40] of the result. \n
+/// Bits [31:24] are written to bits [63:56] of the result. \n
+/// Bits [39:32] are written to bits [79:72] of the result. \n
+/// Bits [47:40] are written to bits [95:88] of the result. \n
+/// Bits [55:48] are written to bits [111:104] of the result. \n
+/// Bits [63:56] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi8(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
+}
+
+/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
+/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
+/// [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16].
+/// Bits [15:0] are written to bits [15:0] of the result. \n
+/// Bits [31:16] are written to bits [47:32] of the result. \n
+/// Bits [47:32] are written to bits [79:64] of the result. \n
+/// Bits [63:48] are written to bits [111:96] of the result.
+/// \param __b
+/// A 128-bit vector of [8 x i16].
+/// Bits [15:0] are written to bits [31:16] of the result. \n
+/// Bits [31:16] are written to bits [63:48] of the result. \n
+/// Bits [47:32] are written to bits [95:80] of the result. \n
+/// Bits [63:48] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
+}
+
+/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32]. \n
+/// Bits [31:0] are written to bits [31:0] of the destination. \n
+/// Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+/// A 128-bit vector of [4 x i32]. \n
+/// Bits [31:0] are written to bits [63:32] of the destination. \n
+/// Bits [63:32] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
+}
+
+/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
+/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x i64]. \n
+/// Bits [63:0] are written to bits [63:0] of the destination. \n
+/// \param __b
+/// A 128-bit vector of [2 x i64]. \n
+/// Bits [63:0] are written to bits [127:64] of the destination. \n
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_unpacklo_epi64(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
+}
+
+/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
+/// integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector operand. The lower 64 bits are moved to the
+/// destination.
+/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_movepi64_pi64(__m128i __a)
+{
+ return (__m64)__a[0];
+}
+
+/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
+/// upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
+///
+/// \param __a
+/// A 64-bit value.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+/// the operand. The upper 64 bits are assigned zeros.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_movpi64_epi64(__m64 __a)
+{
+ return __extension__ (__m128i)(__v2di){ (long long)__a, 0 };
+}
+
+/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
+/// integer vector, zeroing the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __a
+/// A 128-bit integer vector operand. The lower 64 bits are moved to the
+/// destination.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+/// the operand. The upper 64 bits are assigned zeros.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_move_epi64(__m128i __a)
+{
+ return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
+}
+
+/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
+/// [2 x double] and interleaves them into a 128-bit vector of [2 x
+/// double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_unpackhi_pd(__m128d __a, __m128d __b)
+{
+ return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
+}
+
+/// Unpacks the low-order 64-bit elements from two 128-bit vectors
+/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
+/// double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+/// A 128-bit vector of [2 x double]. \n
+/// Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_unpacklo_pd(__m128d __a, __m128d __b)
+{
+ return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
+}
+
+/// Extracts the sign bits of the double-precision values in the 128-bit
+/// vector of [2 x double], zero-extends the value, and writes it to the
+/// low-order bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the values with sign bits to
+/// be extracted.
+/// \returns The sign bits from each of the double-precision elements in \a __a,
+/// written to bits [1:0]. The remaining bits are assigned values of zero.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_pd(__m128d __a)
+{
+ return __builtin_ia32_movmskpd((__v2df)__a);
+}
+
+
+/// Constructs a 128-bit floating-point vector of [2 x double] from two
+/// 128-bit vector parameters of [2 x double], using the immediate-value
+/// parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
+///
+/// \param a
+/// A 128-bit vector of [2 x double].
+/// \param b
+/// A 128-bit vector of [2 x double].
+/// \param i
+/// An 8-bit immediate value. The least significant two bits specify which
+/// elements to copy from \a a and \a b: \n
+/// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
+/// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
+/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
+/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
+/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
+#define _mm_shuffle_pd(a, b, i) \
+ ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
+ (int)(i)))
+
+/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+/// floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+/// bitwise pattern as the parameter.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_castpd_ps(__m128d __a)
+{
+ return (__m128)__a;
+}
+
+/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+/// parameter.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_castpd_si128(__m128d __a)
+{
+ return (__m128i)__a;
+}
+
+/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+/// floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+/// bitwise pattern as the parameter.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_castps_pd(__m128 __a)
+{
+ return (__m128d)__a;
+}
+
+/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+/// parameter.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_castps_si128(__m128 __a)
+{
+ return (__m128i)__a;
+}
+
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+/// bitwise pattern as the parameter.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_castsi128_ps(__m128i __a)
+{
+ return (__m128)__a;
+}
+
+/// Casts a 128-bit integer vector into a 128-bit floating-point vector
+/// of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+/// bitwise pattern as the parameter.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_castsi128_pd(__m128i __a)
+{
+ return (__m128d)__a;
+}
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// Indicates that a spin loop is being executed for the purposes of
+/// optimizing power consumption during the loop.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
+///
+void _mm_pause(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_MMX
+
+#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
+
+#define _MM_DENORMALS_ZERO_ON (0x0040U)
+#define _MM_DENORMALS_ZERO_OFF (0x0000U)
+
+#define _MM_DENORMALS_ZERO_MASK (0x0040U)
+
+#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
+
+#endif /* __EMMINTRIN_H */
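
To make the SSE2 helpers above concrete, here is a minimal usage sketch. It only uses intrinsics declared in emmintrin.h (plus the _MM_SHUFFLE helper from xmmintrin.h, which emmintrin.h pulls in), assumes a target built with SSE2 enabled, and the lane values and printed output are purely illustrative.

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    /* Lanes, low to high: 0, 1, 2, 3.  Note the reversed argument order of
     * _mm_set_epi32 compared with _mm_setr_epi32. */
    __m128i v = _mm_set_epi32(3, 2, 1, 0);

    /* Reverse the four 32-bit lanes with PSHUFD. */
    __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));

    int out[4];
    _mm_storeu_si128((__m128i_u *)out, r);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* prints: 3 2 1 0 */
    return 0;
}
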
--- /dev/null
+/*===------------------ enqcmdintrin.h - enqcmd intrinsics -----------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <enqcmdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __ENQCMDINTRIN_H
+#define __ENQCMDINTRIN_H
+
+/* Define the default attributes for the functions in this file */
+#define _DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("enqcmd")))
+
+/// Reads the 64-byte command pointed to by \a __src, formats 64-byte enqueue
+/// store data, and performs a 64-byte enqueue store to the memory pointed to
+/// by \a __dst. This intrinsic may only be used in User mode.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ENQCMD </c> instruction.
+///
+/// \param __dst
+/// Pointer to the destination of the enqueue store.
+/// \param __src
+/// Pointer to 64-byte command data.
+/// \returns If the command data is successfully written to \a __dst then 0 is
+/// returned. Otherwise 1 is returned.
+static __inline__ int _DEFAULT_FN_ATTRS
+_enqcmd (void *__dst, const void *__src)
+{
+ return __builtin_ia32_enqcmd(__dst, __src);
+}
+
+/// Reads the 64-byte command pointed to by \a __src, formats 64-byte enqueue
+/// store data, and performs a 64-byte enqueue store to the memory pointed to
+/// by \a __dst. This intrinsic may only be used in Privileged mode.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ENQCMDS </c> instruction.
+///
+/// \param __dst
+/// Pointer to the destination of the enqueue store.
+/// \param __src
+/// Pointer to 64-byte command data.
+/// \returns If the command data is successfully written to \a __dst then 0 is
+/// returned. Otherwise 1 is returned.
+static __inline__ int _DEFAULT_FN_ATTRS
+_enqcmds (void *__dst, const void *__src)
+{
+ return __builtin_ia32_enqcmds(__dst, __src);
+}
+
+#undef _DEFAULT_FN_ATTRS
+
+#endif /* __ENQCMDINTRIN_H */
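
A hedged sketch of how _enqcmd() is typically driven. It assumes a device that exposes a 64-byte-aligned work-queue portal (for example, one mmap()ed from its driver) and a device-specific 64-byte descriptor; both the portal pointer and the descriptor layout below are hypothetical placeholders, and the translation unit must be built with ENQCMD support enabled.

#include <immintrin.h>
#include <stdint.h>

/* Hypothetical 64-byte work descriptor; the real layout is device specific. */
struct work_desc {
    uint8_t bytes[64];
} __attribute__((aligned(64)));

/* Submit one descriptor to a 64-byte-aligned device portal.  Returns 0 if the
 * device accepted the command; 1 means the shared work queue was full and the
 * caller should retry or back off. */
static int submit_desc(volatile void *portal, const struct work_desc *desc)
{
    return _enqcmd((void *)portal, desc);
}
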
--- /dev/null
+/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __IMMINTRIN_H
+#error "Never use <f16cintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __F16CINTRIN_H
+#define __F16CINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256)))
+
+/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h,
+ * but that's because icc can emulate these without f16c using a library call.
+ * Since we don't do that let's leave these in f16cintrin.h.
+ */
+
+/// Converts a 16-bit half-precision float value into a 32-bit float
+/// value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+/// A 16-bit half-precision float value.
+/// \returns The converted 32-bit float value.
+static __inline float __DEFAULT_FN_ATTRS128
+_cvtsh_ss(unsigned short __a)
+{
+ __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
+ __v4sf __r = __builtin_ia32_vcvtph2ps(__v);
+ return __r[0];
+}
+
+/// Converts a 32-bit single-precision float value to a 16-bit
+/// half-precision float value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned short _cvtss_sh(float a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+/// A 32-bit single-precision float value to be converted to a 16-bit
+/// half-precision float value.
+/// \param imm
+/// An immediate value controlling rounding using bits [2:0]: \n
+/// 000: Nearest \n
+/// 001: Down \n
+/// 010: Up \n
+/// 011: Truncate \n
+/// 1XX: Use MXCSR.RC for rounding
+/// \returns The converted 16-bit half-precision float value.
+#define _cvtss_sh(a, imm) \
+ ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
+ (imm)))[0]))
+
+/// Converts a 128-bit vector containing 32-bit float values into a
+/// 128-bit vector containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+/// A 128-bit vector containing 32-bit float values.
+/// \param imm
+/// An immediate value controlling rounding using bits [2:0]: \n
+/// 000: Nearest \n
+/// 001: Down \n
+/// 010: Up \n
+/// 011: Truncate \n
+/// 1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing converted 16-bit half-precision float
+/// values. The lower 64 bits are used to store the converted 16-bit
+/// half-precision floating-point values.
+#define _mm_cvtps_ph(a, imm) \
+ ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
+
+/// Converts a 128-bit vector containing 16-bit half-precision float
+/// values into a 128-bit vector containing 32-bit float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector containing 16-bit half-precision float values. The lower
+/// 64 bits are used in the conversion.
+/// \returns A 128-bit vector of [4 x float] containing converted float values.
+static __inline __m128 __DEFAULT_FN_ATTRS128
+_mm_cvtph_ps(__m128i __a)
+{
+ return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+}
+
+/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
+/// containing 16-bit half-precision float values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
+///
+/// \param a
+/// A 256-bit vector containing 32-bit single-precision float values to be
+/// converted to 16-bit half-precision float values.
+/// \param imm
+/// An immediate value controlling rounding using bits [2:0]: \n
+/// 000: Nearest \n
+/// 001: Down \n
+/// 010: Up \n
+/// 011: Truncate \n
+/// 1XX: Use MXCSR.RC for rounding
+/// \returns A 128-bit vector containing the converted 16-bit half-precision
+/// float values.
+#define _mm256_cvtps_ph(a, imm) \
+ ((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)))
+
+/// Converts a 128-bit vector containing 16-bit half-precision float
+/// values into a 256-bit vector of [8 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector containing 16-bit half-precision float values to be
+/// converted to 32-bit single-precision float values.
+/// \returns A vector of [8 x float] containing the converted 32-bit
+/// single-precision float values.
+static __inline __m256 __DEFAULT_FN_ATTRS256
+_mm256_cvtph_ps(__m128i __a)
+{
+ return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif /* __F16CINTRIN_H */
--- /dev/null
+/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __FMA4INTRIN_H
+#define __FMA4INTRIN_H
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
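+
+/* Illustrative semantics sketch: the macc/msub family performs a fused
+ * multiply-add with a single rounding step per element, e.g. for each lane i:
+ *
+ *   _mm_macc_ps(a, b, c)[i] == a[i] * b[i] + c[i]
+ *   _mm_msub_ps(a, b, c)[i] == a[i] * b[i] - c[i]
+ */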
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif /* __FMA4INTRIN_H */
--- /dev/null
+/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FMAINTRIN_H
+#define __FMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
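+
+/* Illustrative usage sketch: _mm_fmadd_ps computes a[i] * b[i] + c[i] for each
+ * lane with a single rounding step:
+ *
+ *   __m128 __r = _mm_fmadd_ps(__a, __b, __c);  // r[i] = a[i]*b[i] + c[i]
+ *
+ * The scalar _ss/_sd forms operate on lane 0 only and copy the remaining lanes
+ * from the first operand.
+ */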
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS128
+_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
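+
+/* Illustrative semantics sketch: the addsub/subadd forms alternate the
+ * operation per lane. For _mm_fmaddsub_ps, even-indexed lanes subtract and
+ * odd-indexed lanes add:
+ *
+ *   r[0] = a[0]*b[0] - c[0];   r[1] = a[1]*b[1] + c[1];
+ *   r[2] = a[2]*b[2] - c[2];   r[3] = a[3]*b[3] + c[3];
+ *
+ * _mm_fmsubadd_ps swaps the pattern (add in even lanes, subtract in odd).
+ */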
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif /* __FMAINTRIN_H */
--- /dev/null
+/*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __FXSRINTRIN_H
+#define __FXSRINTRIN_H
+
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr")))
+
+/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+/// memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
+///
+/// \param __p
+/// A pointer to a 512-byte memory region. The beginning of this memory
+/// region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave(void *__p)
+{
+ __builtin_ia32_fxsave(__p);
+}
+
+/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+/// memory region pointed to by the input parameter \a __p. The contents of
+/// this memory region should have been written to by a previous \c _fxsave
+/// or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
+///
+/// \param __p
+/// A pointer to a 512-byte memory region. The beginning of this memory
+/// region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor(void *__p)
+{
+ __builtin_ia32_fxrstor(__p);
+}
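+
+/* Illustrative usage sketch: saving and later restoring the FP/SIMD state
+ * through a 16-byte-aligned 512-byte buffer (buffer name is a placeholder):
+ *
+ *   static char __fxarea[512] __attribute__((aligned(16)));
+ *   _fxsave(__fxarea);   // FXSAVE into the buffer
+ *   ...
+ *   _fxrstor(__fxarea);  // FXRSTOR from the same buffer
+ */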
+
+#ifdef __x86_64__
+/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
+/// memory region pointed to by the input parameter \a __p.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
+///
+/// \param __p
+/// A pointer to a 512-byte memory region. The beginning of this memory
+/// region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxsave64(void *__p)
+{
+ __builtin_ia32_fxsave64(__p);
+}
+
+/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
+/// memory region pointed to by the input parameter \a __p. The contents of
+/// this memory region should have been written to by a previous \c _fxsave
+/// or \c _fxsave64 intrinsic.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
+///
+/// \param __p
+/// A pointer to a 512-byte memory region. The beginning of this memory
+/// region should be aligned on a 16-byte boundary.
+static __inline__ void __DEFAULT_FN_ATTRS
+_fxrstor64(void *__p)
+{
+ __builtin_ia32_fxrstor64(__p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __GFNIINTRIN_H
+#define __GFNIINTRIN_H
+
+/* Default attributes for simple form (no masking). */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))
+
+/* Default attributes for YMM unmasked form. */
+#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
+
+/* Default attributes for ZMM forms. */
+#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
+
+/* Default attributes for VLX forms. */
+#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
+
+#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
+ ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), \
+ (char)(I)))
+
+#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
+ ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), \
+ (char)(I)))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
+ (__v16qi) __B);
+}
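+
+/* Illustrative semantics sketch: _mm_gf2p8mul_epi8 multiplies corresponding
+ * bytes in GF(2^8) using the AES reduction polynomial x^8 + x^4 + x^3 + x + 1,
+ * so multiplying by 1 returns the input unchanged:
+ *
+ *   __m128i __one = _mm_set1_epi8(1);
+ *   __m128i __r   = _mm_gf2p8mul_epi8(__a, __one);  // __r == __a
+ *
+ * The affine macros take a compile-time immediate I that is XORed into every
+ * result byte after the matrix transform.
+ */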
+
+#ifdef __AVXINTRIN_H
+#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
+ ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
+ (__v32qi)(__m256i)(B), \
+ (char)(I)))
+
+#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
+ ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
+ (__v32qi)(__m256i)(B), \
+ (char)(I)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
+_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
+ (__v32qi) __B);
+}
+#endif /* __AVXINTRIN_H */
+
+#ifdef __AVX512BWINTRIN_H
+#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
+ ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
+ (__v64qi)(__m512i)(B), \
+ (char)(I)))
+
+#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
+ (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
+ (__v64qi)(__m512i)(S)))
+
+#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
+ _mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
+ U, A, B, I)
+
+#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
+ ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
+ (__v64qi)(__m512i)(B), \
+ (char)(I)))
+
+#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
+ ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
+ (__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \
+ (__v64qi)(__m512i)(S)))
+
+#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
+ _mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
+ U, A, B, I)
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
+_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
+ (__v64qi) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
+_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_selectb_512(__U,
+ (__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
+ (__v64qi) __S);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
+_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
+ __U, __A, __B);
+}
+#endif /* __AVX512BWINTRIN_H */
+
+#ifdef __AVX512VLBWINTRIN_H
+#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+ (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
+ (__v16qi)(__m128i)(S)))
+
+#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
+ _mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
+ U, A, B, I)
+
+#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+ (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
+ (__v32qi)(__m256i)(S)))
+
+#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
+ _mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
+ U, A, B, I)
+
+#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
+ ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
+ (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
+ (__v16qi)(__m128i)(S)))
+
+#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
+ _mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I)
+
+#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
+ ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
+ (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
+ (__v32qi)(__m256i)(S)))
+
+#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
+ _mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
+ U, A, B, I)
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
+_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_selectb_128(__U,
+ (__v16qi) _mm_gf2p8mul_epi8(__A, __B),
+ (__v16qi) __S);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
+_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
+ __U, __A, __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
+_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_selectb_256(__U,
+ (__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
+ (__v32qi) __S);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
+_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
+ __U, __A, __B);
+}
+#endif /* __AVX512VLBWINTRIN_H */
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_Y
+#undef __DEFAULT_FN_ATTRS_Z
+#undef __DEFAULT_FN_ATTRS_VL128
+#undef __DEFAULT_FN_ATTRS_VL256
+
+#endif /* __GFNIINTRIN_H */
+
--- /dev/null
+/*===---------------- hresetintrin.h - HRESET intrinsics -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __X86GPRINTRIN_H
+#error "Never use <hresetintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef __HRESETINTRIN_H
+#define __HRESETINTRIN_H
+
+#if __has_extension(gnu_asm)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("hreset")))
+
+/// Provides a hint to the processor to selectively reset the prediction
+/// history of the current logical processor, as selected by the bits of the
+/// 32-bit integer value \a __eax.
+///
+/// This intrinsic corresponds to the <c> HRESET </c> instruction.
+///
+/// \operation
+/// IF __eax == 0
+/// // nop
+/// ELSE
+/// FOR i := 0 to 31
+/// IF __eax[i]
+/// ResetPredictionFeature(i)
+/// FI
+/// ENDFOR
+/// FI
+/// \endoperation
+static __inline void __DEFAULT_FN_ATTRS
+_hreset(int __eax)
+{
+ __asm__ ("hreset $0" :: "a"(__eax));
+}
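+
+/* Illustrative usage sketch: each set bit in the argument selects one
+ * prediction-history reset capability; software is expected to pass only bits
+ * previously enabled by the OS (e.g. via the IA32_HRESET_ENABLE MSR). Passing
+ * zero is a no-op:
+ *
+ *   _hreset(0);              // nop
+ *   _hreset(__hreset_caps);  // __hreset_caps is a placeholder bit mask
+ */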
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __has_extension(gnu_asm) */
+
+#endif /* __HRESETINTRIN_H */
--- /dev/null
+/* ===-------- ia32intrin.h ---------------------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __IA32INTRIN_H
+#define __IA32INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#define __DEFAULT_FN_ATTRS_CRC32 __attribute__((__always_inline__, __nodebug__, __target__("crc32")))
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__))
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
+/** Find the first set bit starting from the lsb. Result is undefined if
+ * input is 0.
+ *
+ * \headerfile <x86intrin.h>
+ *
+ * This intrinsic corresponds to the <c> BSF </c> instruction or the
+ * <c> TZCNT </c> instruction.
+ *
+ * \param __A
+ * A 32-bit integer operand.
+ * \returns A 32-bit integer containing the bit number.
+ */
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
+__bsfd(int __A) {
+ return __builtin_ctz(__A);
+}
+
+/** Find the first set bit starting from the msb. Result is undefined if
+ * input is 0.
+ *
+ * \headerfile <x86intrin.h>
+ *
+ * This intrinsic corresponds to the <c> BSR </c> instruction or the
+ * <c> LZCNT </c> instruction and an <c> XOR </c>.
+ *
+ * \param __A
+ * A 32-bit integer operand.
+ * \returns A 32-bit integer containing the bit number.
+ */
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
+__bsrd(int __A) {
+ return 31 - __builtin_clz(__A);
+}
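+
+/* Illustrative example: for __A = 0x18 (binary 11000), the lowest set bit is
+ * bit 3 and the highest set bit is bit 4:
+ *
+ *   __bsfd(0x18) == 3
+ *   __bsrd(0x18) == 4
+ */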
+
+/** Swaps the bytes in the input, converting little endian to big endian or
+ * vice versa.
+ *
+ * \headerfile <x86intrin.h>
+ *
+ * This intrinsic corresponds to the <c> BSWAP </c> instruction.
+ *
+ * \param __A
+ * A 32-bit integer operand.
+ * \returns A 32-bit integer containing the swapped bytes.
+ */
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
+__bswapd(int __A) {
+ return __builtin_bswap32(__A);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
+_bswap(int __A) {
+ return __builtin_bswap32(__A);
+}
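+
+/* Illustrative example:
+ *
+ *   __bswapd(0x11223344) == 0x44332211
+ */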
+
+#define _bit_scan_forward(A) __bsfd((A))
+#define _bit_scan_reverse(A) __bsrd((A))
+
+#ifdef __x86_64__
+/** Find the first set bit starting from the lsb. Result is undefined if
+ * input is 0.
+ *
+ * \headerfile <x86intrin.h>
+ *
+ * This intrinsic corresponds to the <c> BSF </c> instruction or the
+ * <c> TZCNT </c> instruction.
+ *
+ * \param __A
+ * A 64-bit integer operand.
+ * \returns A 32-bit integer containing the bit number.
+ */
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
+__bsfq(long long __A) {
+ return __builtin_ctzll(__A);
+}
+
+/** Find the first set bit starting from the msb. Result is undefined if
+ * input is 0.
+ *
+ * \headerfile <x86intrin.h>
+ *
+ * This intrinsic corresponds to the <c> BSR </c> instruction or the
+ * <c> LZCNT </c> instruction and an <c> XOR </c>.
+ *
+ * \param __A
+ * A 64-bit integer operand.
+ * \returns A 32-bit integer containing the bit number.
+ */
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
+__bsrq(long long __A) {
+ return 63 - __builtin_clzll(__A);
+}
+
+/** Swaps the bytes in the input, converting little endian to big endian or
+ * vice versa.
+ *
+ * \headerfile <x86intrin.h>
+ *
+ * This intrinsic corresponds to the <c> BSWAP </c> instruction.
+ *
+ * \param __A
+ * A 64-bit integer operand.
+ * \returns A 64-bit integer containing the swapped bytes.
+ */
+static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
+__bswapq(long long __A) {
+ return __builtin_bswap64(__A);
+}
+
+#define _bswap64(A) __bswapq((A))
+#endif
+
+/** Counts the number of bits in the source operand having a value of 1.
+ *
+ * \headerfile <x86intrin.h>
+ *
+ * This intrinsic corresponds to the <c> POPCNT </c> instruction or a
+ * sequence of arithmetic and logic ops to calculate it.
+ *
+ * \param __A
+ * An unsigned 32-bit integer operand.
+ * \returns A 32-bit integer containing the number of bits with value 1 in the
+ * source operand.
+ */
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
+__popcntd(unsigned int __A)
+{
+ return __builtin_popcount(__A);
+}
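+
+/* Illustrative example:
+ *
+ *   __popcntd(0xF0F0u) == 8   // eight bits set
+ */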
+
+#define _popcnt32(A) __popcntd((A))
+
+#ifdef __x86_64__
+/** Counts the number of bits in the source operand having a value of 1.
+ *
+ * \headerfile <x86intrin.h>
+ *
+ * This intrinsic corresponds to the <c> POPCNT </c> instruction or a
+ * sequence of arithmetic and logic ops to calculate it.
+ *
+ * \param __A
+ * An unsigned 64-bit integer operand.
+ * \returns A 64-bit integer containing the number of bits with value 1 in the
+ * source operand.
+ */
+static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
+__popcntq(unsigned long long __A)
+{
+ return __builtin_popcountll(__A);
+}
+
+#define _popcnt64(A) __popcntq((A))
+#endif /* __x86_64__ */
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__readeflags(void)
+{
+ return __builtin_ia32_readeflags_u64();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+__writeeflags(unsigned long long __f)
+{
+ __builtin_ia32_writeeflags_u64(__f);
+}
+
+#else /* !__x86_64__ */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__readeflags(void)
+{
+ return __builtin_ia32_readeflags_u32();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+__writeeflags(unsigned int __f)
+{
+ __builtin_ia32_writeeflags_u32(__f);
+}
+#endif /* !__x86_64__ */
+
+/** Cast a 32-bit float value to a 32-bit unsigned integer value
+ *
+ * \headerfile <x86intrin.h>
+ * This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction in x86_64,
+ * and corresponds to the <c> VMOVL / MOVL </c> instruction in ia32.
+ *
+ * \param __A
+ * A 32-bit float value.
+ * \returns a 32-bit unsigned integer containing the converted value.
+ */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST
+_castf32_u32(float __A) {
+ return __builtin_bit_cast(unsigned int, __A);
+}
+
+/** Cast a 64-bit float value to a 64-bit unsigned integer value
+ *
+ * \headerfile <x86intrin.h>
+ * This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction in x86_64,
+ * and corresponds to the <c> VMOVL / MOVL </c> instruction in ia32.
+ *
+ * \param __A
+ * A 64-bit float value.
+ * \returns a 64-bit unsigned integer containing the converted value.
+ */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST
+_castf64_u64(double __A) {
+ return __builtin_bit_cast(unsigned long long, __A);
+}
+
+/** Cast a 32-bit unsigned integer value to a 32-bit float value
+ *
+ * \headerfile <x86intrin.h>
+ * This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction in x86_64,
+ * and corresponds to the <c> FLDS </c> instruction in ia32.
+ *
+ * \param __A
+ * A 32-bit unsigned integer value.
+ * \returns a 32-bit float value containing the converted value.
+ */
+static __inline__ float __DEFAULT_FN_ATTRS_CAST
+_castu32_f32(unsigned int __A) {
+ return __builtin_bit_cast(float, __A);
+}
+
+/** Cast a 64-bit unsigned integer value to a 64-bit float value
+ *
+ * \headerfile <x86intrin.h>
+ * This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction in x86_64,
+ * and corresponds to the <c> FLDL </c> instruction in ia32.
+ *
+ * \param __A
+ * A 64-bit unsigned integer value.
+ * \returns a 64-bit float value containing the converted value.
+ */
+static __inline__ double __DEFAULT_FN_ATTRS_CAST
+_castu64_f64(unsigned long long __A) {
+ return __builtin_bit_cast(double, __A);
+}
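+
+/* Illustrative example: these casts reinterpret the bit pattern without any
+ * numeric conversion, e.g. for IEEE-754 single precision:
+ *
+ *   _castf32_u32(1.0f) == 0x3F800000u
+ *   _castu32_f32(0x3F800000u) == 1.0f
+ */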
+
+/** Adds the unsigned integer operand to the CRC-32C checksum of the
+ * unsigned char operand.
+ *
+ * \headerfile <x86intrin.h>
+ *
+ * This intrinsic corresponds to the <c> CRC32B </c> instruction.
+ *
+ * \param __C
+ * An unsigned integer operand to add to the CRC-32C checksum of operand
+ * \a __D.
+ * \param __D
+ * An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
+ * \returns The result of adding operand \a __C to the CRC-32C checksum of
+ * operand \a __D.
+ */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
+__crc32b(unsigned int __C, unsigned char __D)
+{
+ return __builtin_ia32_crc32qi(__C, __D);
+}
+
+/** Adds the unsigned integer operand to the CRC-32C checksum of the
+ * unsigned short operand.
+ *
+ * \headerfile <x86intrin.h>
+ *
+ * This intrinsic corresponds to the <c> CRC32W </c> instruction.
+ *
+ * \param __C
+ * An unsigned integer operand to add to the CRC-32C checksum of operand
+ * \a __D.
+ * \param __D
+ * An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
+ * \returns The result of adding operand \a __C to the CRC-32C checksum of
+ * operand \a __D.
+ */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
+__crc32w(unsigned int __C, unsigned short __D)
+{
+ return __builtin_ia32_crc32hi(__C, __D);
+}
+
+/** Adds the unsigned integer operand to the CRC-32C checksum of the
+ * second unsigned integer operand.
+ *
+ * \headerfile <x86intrin.h>
+ *
+ * This intrinsic corresponds to the <c> CRC32D </c> instruction.
+ *
+ * \param __C
+ * An unsigned integer operand to add to the CRC-32C checksum of operand
+ * \a __D.
+ * \param __D
+ * An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
+ * \returns The result of adding operand \a __C to the CRC-32C checksum of
+ * operand \a __D.
+ */
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
+__crc32d(unsigned int __C, unsigned int __D)
+{
+ return __builtin_ia32_crc32si(__C, __D);
+}
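+
+/* Illustrative usage sketch: accumulating a CRC-32C checksum over a byte
+ * buffer by feeding the running value back in (__buf and __len are placeholder
+ * names):
+ *
+ *   unsigned int __crc = 0;
+ *   for (unsigned int __i = 0; __i < __len; ++__i)
+ *     __crc = __crc32b(__crc, __buf[__i]);
+ */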
+
+#ifdef __x86_64__
+/** Adds the unsigned integer operand to the CRC-32C checksum of the
+ * unsigned 64-bit integer operand.
+ *
+ * \headerfile <x86intrin.h>
+ *
+ * This intrinsic corresponds to the <c> CRC32Q </c> instruction.
+ *
+ * \param __C
+ * An unsigned integer operand to add to the CRC-32C checksum of operand
+ * \a __D.
+ * \param __D
+ * An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
+ * \returns The result of adding operand \a __C to the CRC-32C checksum of
+ * operand \a __D.
+ */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CRC32
+__crc32q(unsigned long long __C, unsigned long long __D)
+{
+ return __builtin_ia32_crc32di(__C, __D);
+}
+#endif /* __x86_64__ */
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__rdpmc(int __A) {
+ return __builtin_ia32_rdpmc(__A);
+}
+
+/* __rdtscp */
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__rdtscp(unsigned int *__A) {
+ return __builtin_ia32_rdtscp(__A);
+}
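+
+/* Illustrative usage sketch: __rdtscp also returns the IA32_TSC_AUX value
+ * (typically programmed by the OS with a processor id) through its pointer
+ * argument:
+ *
+ *   unsigned int __aux;
+ *   unsigned long long __tsc = __rdtscp(&__aux);
+ */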
+
+#define _rdtsc() __rdtsc()
+
+#define _rdpmc(A) __rdpmc(A)
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_wbinvd(void) {
+ __builtin_ia32_wbinvd();
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
+__rolb(unsigned char __X, int __C) {
+ return __builtin_rotateleft8(__X, __C);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
+__rorb(unsigned char __X, int __C) {
+ return __builtin_rotateright8(__X, __C);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
+__rolw(unsigned short __X, int __C) {
+ return __builtin_rotateleft16(__X, __C);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
+__rorw(unsigned short __X, int __C) {
+ return __builtin_rotateright16(__X, __C);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
+__rold(unsigned int __X, int __C) {
+ return __builtin_rotateleft32(__X, __C);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
+__rord(unsigned int __X, int __C) {
+ return __builtin_rotateright32(__X, __C);
+}
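+
+/* Illustrative example: rotates wrap the bits shifted out back into the other
+ * end:
+ *
+ *   __rold(0x80000001u, 1) == 0x00000003u
+ *   __rord(0x00000003u, 1) == 0x80000001u
+ */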
+
+#ifdef __x86_64__
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
+__rolq(unsigned long long __X, int __C) {
+ return __builtin_rotateleft64(__X, __C);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
+__rorq(unsigned long long __X, int __C) {
+ return __builtin_rotateright64(__X, __C);
+}
+#endif /* __x86_64__ */
+
+#ifndef _MSC_VER
+/* These are already provided as builtins for MSVC. */
+/* Select the correct function based on the size of long. */
+#ifdef __LP64__
+#define _lrotl(a,b) __rolq((a), (b))
+#define _lrotr(a,b) __rorq((a), (b))
+#else
+#define _lrotl(a,b) __rold((a), (b))
+#define _lrotr(a,b) __rord((a), (b))
+#endif
+#define _rotl(a,b) __rold((a), (b))
+#define _rotr(a,b) __rord((a), (b))
+#endif // _MSC_VER
+
+/* These are not builtins so need to be provided in all modes. */
+#define _rotwl(a,b) __rolw((a), (b))
+#define _rotwr(a,b) __rorw((a), (b))
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CAST
+#undef __DEFAULT_FN_ATTRS_CRC32
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
+
+#endif /* __IA32INTRIN_H */
--- /dev/null
+/*===---- immintrin.h - Intel intrinsics -----------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#define __IMMINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+#include <x86gprintrin.h>
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__MMX__)
+#include <mmintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__SSE__)
+#include <xmmintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__SSE2__)
+#include <emmintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__SSE3__)
+#include <pmmintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__SSSE3__)
+#include <tmmintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__SSE4_2__) || defined(__SSE4_1__))
+#include <smmintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AES__) || defined(__PCLMUL__))
+#include <wmmintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__CLFLUSHOPT__)
+#include <clflushoptintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__CLWB__)
+#include <clwbintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX__)
+#include <avxintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX2__)
+#include <avx2intrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__F16C__)
+#include <f16cintrin.h>
+#endif
+
+/* No feature check desired due to internal checks */
+#include <bmiintrin.h>
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__BMI2__)
+#include <bmi2intrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__LZCNT__)
+#include <lzcntintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__POPCNT__)
+#include <popcntintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__FMA__)
+#include <fmaintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512F__)
+#include <avx512fintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512VL__)
+#include <avx512vlintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512BW__)
+#include <avx512bwintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512BITALG__)
+#include <avx512bitalgintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512CD__)
+#include <avx512cdintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512VPOPCNTDQ__)
+#include <avx512vpopcntdqintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
+#include <avx512vpopcntdqvlintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512VNNI__)
+#include <avx512vnniintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AVX512VL__) && defined(__AVX512VNNI__))
+#include <avx512vlvnniintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVXVNNI__)
+#include <avxvnniintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512DQ__)
+#include <avx512dqintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AVX512VL__) && defined(__AVX512BITALG__))
+#include <avx512vlbitalgintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AVX512VL__) && defined(__AVX512BW__))
+#include <avx512vlbwintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AVX512VL__) && defined(__AVX512CD__))
+#include <avx512vlcdintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AVX512VL__) && defined(__AVX512DQ__))
+#include <avx512vldqintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512ER__)
+#include <avx512erintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512IFMA__)
+#include <avx512ifmaintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AVX512IFMA__) && defined(__AVX512VL__))
+#include <avx512ifmavlintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512VBMI__)
+#include <avx512vbmiintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AVX512VBMI__) && defined(__AVX512VL__))
+#include <avx512vbmivlintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512VBMI2__)
+#include <avx512vbmi2intrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
+#include <avx512vlvbmi2intrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512PF__)
+#include <avx512pfintrin.h>
+#endif
+
+/*
+ * FIXME: The _Float16 type is legal only when the hardware supports float16
+ * operations. We use __AVX512FP16__ to identify whether float16 is supported,
+ * so when it is not, the related headers are not included.
+ */
+#if defined(__AVX512FP16__)
+#include <avx512fp16intrin.h>
+#endif
+
+#if defined(__AVX512FP16__) && defined(__AVX512VL__)
+#include <avx512vlfp16intrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512BF16__)
+#include <avx512bf16intrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AVX512VL__) && defined(__AVX512BF16__))
+#include <avx512vlbf16intrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__PKU__)
+#include <pkuintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__VPCLMULQDQ__)
+#include <vpclmulqdqintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__VAES__)
+#include <vaesintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__GFNI__)
+#include <gfniintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__RDPID__)
+/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> RDPID </c> instruction.
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
+_rdpid_u32(void) {
+ return __builtin_ia32_rdpid();
+}
+#endif // __RDPID__
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__RDRND__)
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand16_step(unsigned short *__p)
+{
+ return __builtin_ia32_rdrand16_step(__p);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand32_step(unsigned int *__p)
+{
+ return __builtin_ia32_rdrand32_step(__p);
+}
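+
+/* Illustrative usage sketch: the step intrinsics return 1 on success and 0 if
+ * no random value was available, so callers typically retry:
+ *
+ *   unsigned int __val;
+ *   while (!_rdrand32_step(&__val))
+ *     ;  // retry until the hardware RNG delivers a value
+ */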
+
+#ifdef __x86_64__
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+ return __builtin_ia32_rdrand64_step(__p);
+}
+#endif
+#endif /* __RDRND__ */
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__FSGSBASE__)
+#ifdef __x86_64__
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u32(void)
+{
+ return __builtin_ia32_rdfsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readfsbase_u64(void)
+{
+ return __builtin_ia32_rdfsbase64();
+}
+
+static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u32(void)
+{
+ return __builtin_ia32_rdgsbase32();
+}
+
+static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_readgsbase_u64(void)
+{
+ return __builtin_ia32_rdgsbase64();
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u32(unsigned int __V)
+{
+ __builtin_ia32_wrfsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writefsbase_u64(unsigned long long __V)
+{
+ __builtin_ia32_wrfsbase64(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u32(unsigned int __V)
+{
+ __builtin_ia32_wrgsbase32(__V);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
+_writegsbase_u64(unsigned long long __V)
+{
+ __builtin_ia32_wrgsbase64(__V);
+}
+
+#endif
+#endif /* __FSGSBASE__ */
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__MOVBE__)
+
+/* The structs used below are to force the load/store to be unaligned. This
+ * is accomplished with the __packed__ attribute. The __may_alias__ attribute
+ * prevents TBAA (type-based alias analysis) metadata from being generated
+ * based on the struct and the type of the field inside of it.
+ */
+
+static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
+_loadbe_i16(void const * __P) {
+ struct __loadu_i16 {
+ short __v;
+ } __attribute__((__packed__, __may_alias__));
+ return __builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
+_storebe_i16(void * __P, short __D) {
+ struct __storeu_i16 {
+ short __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_i16*)__P)->__v = __builtin_bswap16(__D);
+}
+
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
+_loadbe_i32(void const * __P) {
+ struct __loadu_i32 {
+ int __v;
+ } __attribute__((__packed__, __may_alias__));
+ return __builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
+_storebe_i32(void * __P, int __D) {
+ struct __storeu_i32 {
+ int __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_i32*)__P)->__v = __builtin_bswap32(__D);
+}
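+
+/* Illustrative usage sketch: writing a value in big-endian byte order to a
+ * possibly unaligned buffer and reading it back:
+ *
+ *   char __buf[4];
+ *   _storebe_i32(__buf, 0x11223344);  // memory now holds bytes 11 22 33 44
+ *   int __v = _loadbe_i32(__buf);     // __v == 0x11223344
+ */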
+
+#ifdef __x86_64__
+static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
+_loadbe_i64(void const * __P) {
+ struct __loadu_i64 {
+ long long __v;
+ } __attribute__((__packed__, __may_alias__));
+ return __builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
+_storebe_i64(void * __P, long long __D) {
+ struct __storeu_i64 {
+ long long __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_i64*)__P)->__v = __builtin_bswap64(__D);
+}
+#endif
+#endif /* __MOVBE__ */
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__RTM__)
+#include <rtmintrin.h>
+#include <xtestintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__SHA__)
+#include <shaintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__FXSR__)
+#include <fxsrintrin.h>
+#endif
+
+/* No feature check desired due to internal MSC_VER checks */
+#include <xsaveintrin.h>
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__XSAVEOPT__)
+#include <xsaveoptintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__XSAVEC__)
+#include <xsavecintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__XSAVES__)
+#include <xsavesintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__SHSTK__)
+#include <cetintrin.h>
+#endif
+
+/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
+ * whereas others are available at all times. */
+#include <adxintrin.h>
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__RDSEED__)
+#include <rdseedintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__WBNOINVD__)
+#include <wbnoinvdintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__CLDEMOTE__)
+#include <cldemoteintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__WAITPKG__)
+#include <waitpkgintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__MOVDIRI__) || defined(__MOVDIR64B__)
+#include <movdirintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__PCONFIG__)
+#include <pconfigintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__SGX__)
+#include <sgxintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__PTWRITE__)
+#include <ptwriteintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__INVPCID__)
+#include <invpcidintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__KL__) || defined(__WIDEKL__)
+#include <keylockerintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__)
+#include <amxintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512VP2INTERSECT__)
+#include <avx512vp2intersectintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
+#include <avx512vlvp2intersectintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__ENQCMD__)
+#include <enqcmdintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__SERIALIZE__)
+#include <serializeintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__TSXLDTRK__)
+#include <tsxldtrkintrin.h>
+#endif
+
+#if defined(_MSC_VER) && __has_extension(gnu_asm)
+/* Define the default attributes for these intrinsics */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Exchange HLE
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
+ __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
+ : "+r" (_Value), "+m" (*_Target) :: "memory");
+ return _Value;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
+ __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
+ : "+r" (_Value), "+m" (*_Target) :: "memory");
+ return _Value;
+}
+#endif
+#if defined(__x86_64__)
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
+ __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
+ : "+r" (_Value), "+m" (*_Target) :: "memory");
+ return _Value;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
+ __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
+ : "+r" (_Value), "+m" (*_Target) :: "memory");
+ return _Value;
+}
+#endif
+/*----------------------------------------------------------------------------*\
+|* Interlocked Compare Exchange HLE
+\*----------------------------------------------------------------------------*/
+#if defined(__i386__) || defined(__x86_64__)
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
+ long _Exchange, long _Comparand) {
+ __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
+ : "+a" (_Comparand), "+m" (*_Destination)
+ : "r" (_Exchange) : "memory");
+ return _Comparand;
+}
+static __inline__ long __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange_HLERelease(long volatile *_Destination,
+ long _Exchange, long _Comparand) {
+ __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
+ : "+a" (_Comparand), "+m" (*_Destination)
+ : "r" (_Exchange) : "memory");
+ return _Comparand;
+}
+#endif
+#if defined(__x86_64__)
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
+ __int64 _Exchange, __int64 _Comparand) {
+ __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
+ : "+a" (_Comparand), "+m" (*_Destination)
+ : "r" (_Exchange) : "memory");
+ return _Comparand;
+}
+static __inline__ __int64 __DEFAULT_FN_ATTRS
+_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
+ __int64 _Exchange, __int64 _Comparand) {
+ __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
+ : "+a" (_Comparand), "+m" (*_Destination)
+ : "r" (_Exchange) : "memory");
+ return _Comparand;
+}
+#endif
+#ifdef __cplusplus
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* defined(_MSC_VER) && __has_extension(gnu_asm) */
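+
+/* Illustrative sketch (not part of the upstream header): a minimal spinlock
+ * built on the HLE-prefixed exchange wrappers above. It assumes an
+ * MSVC-compatible environment in which these intrinsics are compiled in; on
+ * hardware without TSX/HLE the 0xF2/0xF3 prefixes are simply ignored.
+ *
+ *   static volatile long __example_lock = 0;
+ *
+ *   static void __example_acquire(void) {
+ *     // Spin until the previous value was 0, i.e. this thread owns the lock.
+ *     while (_InterlockedExchange_HLEAcquire(&__example_lock, 1) != 0)
+ *       ;
+ *   }
+ *
+ *   static void __example_release(void) {
+ *     _InterlockedExchange_HLERelease(&__example_lock, 0);
+ *   }
+ */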
+
+#endif /* __IMMINTRIN_H */
--- /dev/null
+/*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <invpcidintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __INVPCIDINTRIN_H
+#define __INVPCIDINTRIN_H
+
+static __inline__ void
+ __attribute__((__always_inline__, __nodebug__, __target__("invpcid")))
+_invpcid(unsigned int __type, void *__descriptor) {
+ __builtin_ia32_invpcid(__type, __descriptor);
+}
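+
+/* Illustrative sketch (not part of the upstream header): issue a
+ * single-context INVPCID from ring-0 code. The descriptor layout (PCID in
+ * bits 11:0 of the first quadword, linear address in the second quadword)
+ * follows the Intel SDM; the struct and function names are invented for this
+ * example, and the code must be built with the invpcid feature enabled
+ * (e.g. -minvpcid).
+ *
+ *   struct __example_invpcid_desc {
+ *     unsigned long long pcid;    // bits 11:0 hold the PCID, rest reserved
+ *     unsigned long long address; // used only by type 0 (individual address)
+ *   };
+ *
+ *   static void __example_flush_pcid(unsigned long long pcid) {
+ *     struct __example_invpcid_desc desc = { pcid, 0 };
+ *     _invpcid(1, &desc); // type 1: single-context invalidation
+ *   }
+ */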
+
+#endif /* __INVPCIDINTRIN_H */
--- /dev/null
+/*===----------------- keylockerintrin.h - KL Intrinsics -------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <keylockerintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _KEYLOCKERINTRIN_H
+#define _KEYLOCKERINTRIN_H
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__KL__)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("kl"),\
+ __min_vector_width__(128)))
+
+/// Load the internal wrapping key from __intkey, __enkey_lo and __enkey_hi.
+/// __ctl will be assigned to EAX, which specifies the KeySource and whether
+/// backing up the key is permitted. The 256-bit encryption key is loaded from
+/// the two explicit operands (__enkey_lo and __enkey_hi). The 128-bit
+/// integrity key is loaded from the implicit operand XMM0, which is assigned
+/// by __intkey.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LOADIWKEY </c> instruction.
+///
+/// \operation
+/// IF CPL > 0 // LOADIWKEY only allowed at ring 0 (supervisor mode)
+/// GP (0)
+/// FI
+/// IF “LOADIWKEY exiting” VM execution control set
+/// VMexit
+/// FI
+/// IF __ctl[4:1] > 1 // Reserved KeySource encoding used
+/// GP (0)
+/// FI
+/// IF __ctl[31:5] != 0 // Reserved bit in __ctl is set
+/// GP (0)
+/// FI
+/// IF __ctl[0] AND (CPUID.19H.ECX[0] == 0) // NoBackup is not supported on this part
+/// GP (0)
+/// FI
+/// IF (__ctl[4:1] == 1) AND (CPUID.19H.ECX[1] == 0) // KeySource of 1 is not supported on this part
+/// GP (0)
+/// FI
+/// IF (__ctl[4:1] == 0) // KeySource of 0.
+/// IWKey.Encryption Key[255:0] := __enkey_hi[127:0]:__enkey_lo[127:0]
+/// IWKey.IntegrityKey[127:0] := __intkey[127:0]
+/// IWKey.NoBackup := __ctl[0]
+/// IWKey.KeySource := __ctl[4:1]
+/// ZF := 0
+/// ELSE // KeySource of 1. See RDSEED definition for details of randomness
+/// IF HW_NRND_GEN.ready == 1 // Full-entropy random data from RDSEED was received
+/// IWKey.Encryption Key[255:0] := __enkey_hi[127:0]:__enkey_lo[127:0] XOR HW_NRND_GEN.data[255:0]
+/// IWKey.IntegrityKey[127:0] := __intkey[127:0] XOR HW_NRND_GEN.data[383:256]
+/// IWKey.NoBackup := __ctl[0]
+/// IWKey.KeySource := __ctl[4:1]
+/// ZF := 0
+/// ELSE // Random data was not returned from RDSEED. IWKey was not loaded
+/// ZF := 1
+/// FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
+ __m128i __enkey_lo, __m128i __enkey_hi) {
+ __builtin_ia32_loadiwkey (__intkey, __enkey_lo, __enkey_hi, __ctl);
+}
+
+/// Wrap a 128-bit AES key from __key into a key handle, store the handle in
+/// ((__m128i*)__h) to ((__m128i*)__h) + 2, and return a 32-bit value.
+/// The explicit source operand __htype specifies handle restrictions.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ENCODEKEY128 </c> instruction.
+///
+/// \operation
+/// InputKey[127:0] := __key[127:0]
+/// KeyMetadata[2:0] := __htype[2:0]
+/// KeyMetadata[23:3] := 0 // Reserved for future usage
+/// KeyMetadata[27:24] := 0 // KeyType is AES-128 (value of 0)
+/// KeyMetadata[127:28] := 0 // Reserved for future usage
+/// Handle[383:0] := WrapKey128(InputKey[127:0], KeyMetadata[127:0],
+/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
+/// dst[0] := IWKey.NoBackup
+/// dst[4:1] := IWKey.KeySource[3:0]
+/// dst[31:5] := 0
+/// MEM[__h+127:__h] := Handle[127:0] // AAD
+/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag
+/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText
+/// OF := 0
+/// SF := 0
+/// ZF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
+ return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h);
+}
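+
+/* Illustrative sketch (not part of the upstream header): wrap a 128-bit AES
+ * key into a 384-bit (three __m128i) handle. Assumes the internal wrapping
+ * key was already loaded (e.g. by the OS via LOADIWKEY) and that the code is
+ * built with the kl feature enabled (e.g. -mkl); the handle-type value 0
+ * requests no extra restrictions, and the function name is invented for the
+ * example.
+ *
+ *   static unsigned int __example_wrap_key128(__m128i key,
+ *                                             __m128i handle_out[3]) {
+ *     return _mm_encodekey128_u32(0, key, handle_out);
+ *   }
+ */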
+
+/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, store the
+/// handle in ((__m128i*)__h) to ((__m128i*)__h) + 3, and return a 32-bit
+/// value.
+/// The explicit source operand __htype specifies handle restrictions.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> ENCODEKEY256 </c> instruction.
+///
+/// \operation
+/// InputKey[127:0] := __key_lo[127:0]
+/// InputKey[255:128] := __key_hi[127:0]
+/// KeyMetadata[2:0] := __htype[2:0]
+/// KeyMetadata[23:3] := 0 // Reserved for future usage
+/// KeyMetadata[27:24] := 1 // KeyType is AES-256 (value of 1)
+/// KeyMetadata[127:28] := 0 // Reserved for future usage
+/// Handle[511:0] := WrapKey256(InputKey[255:0], KeyMetadata[127:0],
+/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
+/// dst[0] := IWKey.NoBackup
+/// dst[4:1] := IWKey.KeySource[3:0]
+/// dst[31:5] := 0
+/// MEM[__h+127:__h] := Handle[127:0] // AAD
+/// MEM[__h+255:__h+128] := Handle[255:128] // Tag
+/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0]
+/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128]
+/// OF := 0
+/// SF := 0
+/// ZF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
+ void *__h) {
+ return __builtin_ia32_encodekey256_u32(__htype, (__v2di)__key_lo,
+ (__v2di)__key_hi, __h);
+}
+
+/// The AESENC128KL instruction performs 10 rounds of AES to encrypt __idata
+/// using the 128-bit key in the handle at __h, stores the result in __odata,
+/// and returns the resulting ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESENC128KL </c> instruction.
+///
+/// \operation
+/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
+/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
+/// (Handle[127:0] AND (CPL > 0)) ||
+/// Handle[383:256] ||
+/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
+/// IF (IllegalHandle)
+/// ZF := 1
+/// ELSE
+/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
+/// IF (Authentic == 0)
+/// ZF := 1
+/// ELSE
+/// MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], UnwrappedKey)
+/// ZF := 0
+/// FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
+ return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
+}
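+
+/* Illustrative sketch (not part of the upstream header): encrypt one block
+ * with a previously wrapped 128-bit key handle and check the ZF-derived
+ * return value (nonzero means the handle was rejected and no ciphertext was
+ * produced). The function name is invented for the example.
+ *
+ *   static int __example_encrypt_block(__m128i *out, __m128i in,
+ *                                      const __m128i handle[3]) {
+ *     return _mm_aesenc128kl_u8(out, in, handle) == 0 ? 0 : -1;
+ *   }
+ */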
+
+/// The AESENC256KL instruction performs 14 rounds of AES to encrypt __idata
+/// using the 256-bit key in the handle at __h, stores the result in __odata,
+/// and returns the resulting ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESENC256KL </c> instruction.
+///
+/// \operation
+/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic.
+/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
+/// (Handle[127:0] AND (CPL > 0)) ||
+/// Handle[255:128] ||
+/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
+/// IF (IllegalHandle)
+/// ZF := 1
+/// MEM[__odata+127:__odata] := 0
+/// ELSE
+/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
+/// IF (Authentic == 0)
+/// ZF := 1
+/// MEM[__odata+127:__odata] := 0
+/// ELSE
+/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey)
+/// ZF := 0
+/// FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
+ return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
+}
+
+/// The AESDEC128KL instruction performs 10 rounds of AES to decrypt __idata
+/// using the 128-bit key in the handle at __h, stores the result in __odata,
+/// and returns the resulting ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESDEC128KL </c> instruction.
+///
+/// \operation
+/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
+/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) ||
+/// (Handle[127:0] AND (CPL > 0)) ||
+/// Handle[383:256] ||
+/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128)
+/// IF (IllegalHandle)
+/// ZF := 1
+/// MEM[__odata+127:__odata] := 0
+/// ELSE
+/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
+/// IF (Authentic == 0)
+/// ZF := 1
+/// MEM[__odata+127:__odata] := 0
+/// ELSE
+/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey)
+/// ZF := 0
+/// FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
+ return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
+}
+
+/// The AESDEC256KL instruction performs 14 rounds of AES to decrypt __idata
+/// using the 256-bit key in the handle at __h, stores the result in __odata,
+/// and returns the resulting ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESDEC256KL </c> instruction.
+///
+/// \operation
+/// Handle[511:0] := MEM[__h+511:__h]
+/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) ||
+/// (Handle[127:0] AND (CPL > 0)) ||
+/// Handle[383:256] ||
+/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256)
+/// IF (IllegalHandle)
+/// ZF := 1
+/// MEM[__odata+127:__odata] := 0
+/// ELSE
+/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
+/// IF (Authentic == 0)
+/// ZF := 1
+/// MEM[__odata+127:__odata] := 0
+/// ELSE
+/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey)
+/// ZF := 0
+/// FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
+ return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
+ || defined(__KL__) */
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__WIDEKL__)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\
+ __min_vector_width__(128)))
+
+/// Encrypt __idata[0] through __idata[7] using the 128-bit AES key indicated
+/// by the handle at __h, store the resulting blocks in __odata[0] through
+/// __odata[7], and return the resulting ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESENCWIDE128KL </c> instruction.
+///
+/// \operation
+/// Handle := MEM[__h+383:__h]
+/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
+/// (Handle[127:0] AND (CPL > 0)) ||
+/// Handle[255:128] ||
+/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
+/// IF (IllegalHandle)
+/// ZF := 1
+/// FOR i := 0 to 7
+/// __odata[i] := 0
+/// ENDFOR
+/// ELSE
+/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
+/// IF Authentic == 0
+/// ZF := 1
+/// FOR i := 0 to 7
+/// __odata[i] := 0
+/// ENDFOR
+/// ELSE
+/// FOR i := 0 to 7
+/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey)
+/// ENDFOR
+/// ZF := 0
+/// FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
+ return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata,
+ (const __v2di *)__idata, __h);
+}
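+
+/* Illustrative sketch (not part of the upstream header): encrypt eight
+ * blocks with one AESENCWIDE128KL operation. Both arrays are caller
+ * provided; a nonzero return means the handle was rejected and the output
+ * blocks were zeroed. Requires the widekl feature (e.g. -mwidekl), and the
+ * function name is invented for the example.
+ *
+ *   static int __example_encrypt8(__m128i out[8], const __m128i in[8],
+ *                                 const __m128i handle[3]) {
+ *     return _mm_aesencwide128kl_u8(out, in, handle) == 0 ? 0 : -1;
+ *   }
+ */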
+
+/// Encrypt __idata[0] through __idata[7] using the 256-bit AES key indicated
+/// by the handle at __h, store the resulting blocks in __odata[0] through
+/// __odata[7], and return the resulting ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESENCWIDE256KL </c> instruction.
+///
+/// \operation
+/// Handle[511:0] := MEM[__h+511:__h]
+/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
+/// (Handle[127:0] AND (CPL > 0)) ||
+/// Handle[255:128] ||
+/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
+/// IF (IllegalHandle)
+/// ZF := 1
+/// FOR i := 0 to 7
+/// __odata[i] := 0
+/// ENDFOR
+/// ELSE
+/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
+/// IF Authentic == 0
+/// ZF := 1
+/// FOR i := 0 to 7
+/// __odata[i] := 0
+/// ENDFOR
+/// ELSE
+/// FOR i := 0 to 7
+/// __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey)
+/// ENDFOR
+/// ZF := 0
+/// FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
+ return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata,
+ (const __v2di *)__idata, __h);
+}
+
+/// Decrypt __idata[0] through __idata[7] using the 128-bit AES key indicated
+/// by the handle at __h, store the resulting blocks in __odata[0] through
+/// __odata[7], and return the resulting ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESDECWIDE128KL </c> instruction.
+///
+/// \operation
+/// Handle[383:0] := MEM[__h+383:__h]
+/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
+/// (Handle[127:0] AND (CPL > 0)) ||
+/// Handle[255:128] ||
+/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
+/// IF (IllegalHandle)
+/// ZF := 1
+/// FOR i := 0 to 7
+/// __odata[i] := 0
+/// ENDFOR
+/// ELSE
+/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
+/// IF Authentic == 0
+/// ZF := 1
+/// FOR i := 0 to 7
+/// __odata[i] := 0
+/// ENDFOR
+/// ELSE
+/// FOR i := 0 to 7
+/// __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey)
+/// ENDFOR
+/// ZF := 0
+/// FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
+ return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata,
+ (const __v2di *)__idata, __h);
+}
+
+/// Decrypt __idata[0] through __idata[7] using the 256-bit AES key indicated
+/// by the handle at __h, store the resulting blocks in __odata[0] through
+/// __odata[7], and return the resulting ZF flag status.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> AESDECWIDE256KL </c> instruction.
+///
+/// \operation
+/// Handle[511:0] := MEM[__h+511:__h]
+/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
+/// (Handle[127:0] AND (CPL > 0)) ||
+/// Handle[255:128] ||
+/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
+/// IF (IllegalHandle)
+/// ZF := 1
+/// FOR i := 0 to 7
+/// __odata[i] := 0
+/// ENDFOR
+/// ELSE
+/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
+/// IF Authentic == 0
+/// ZF := 1
+/// FOR i := 0 to 7
+/// __odata[i] := 0
+/// ENDFOR
+/// ELSE
+/// FOR i := 0 to 7
+/// __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey)
+/// ENDFOR
+/// ZF := 0
+/// FI
+/// FI
+/// dst := ZF
+/// OF := 0
+/// SF := 0
+/// AF := 0
+/// PF := 0
+/// CF := 0
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
+ return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata,
+ (const __v2di *)__idata, __h);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
+ || defined(__WIDEKL__) */
+
+#endif /* _KEYLOCKERINTRIN_H */
--- /dev/null
+/*===---- lwpintrin.h - LWP intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LWPINTRIN_H
+#define __LWPINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))
+
+/// Parses the LWPCB at the specified address and enables
+/// profiling if valid.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LLWPCB </c> instruction.
+///
+/// \param __addr
+/// Address to the new Lightweight Profiling Control Block (LWPCB). If the
+/// LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables
+/// Lightweight Profiling.
+static __inline__ void __DEFAULT_FN_ATTRS
+__llwpcb (void *__addr)
+{
+ __builtin_ia32_llwpcb(__addr);
+}
+
+/// Flushes the LWP state to memory and returns the address of the LWPCB.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> SLWPCB </c> instruction.
+///
+/// \return
+/// Address to the current Lightweight Profiling Control Block (LWPCB).
+/// If LWP is not currently enabled, returns NULL.
+static __inline__ void* __DEFAULT_FN_ATTRS
+__slwpcb (void)
+{
+ return __builtin_ia32_slwpcb();
+}
+
+/// Inserts a programmed event record into the LWP event ring buffer
+/// and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
+///
+/// \param DATA2
+/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
+/// \param DATA1
+/// A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+/// A 32-bit immediate value is inserted into the 32-bit Flags field.
+/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
+/// the event record overwrites the last record in the buffer, the MissedEvents
+/// counter in the LWPCB is incremented, the head pointer is not advanced, and
+/// 1 is returned. Otherwise 0 is returned.
+#define __lwpins32(DATA2, DATA1, FLAGS) \
+ (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
+ (unsigned int) (FLAGS)))
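+
+/* Illustrative sketch (not part of the upstream header): record one event if
+ * Lightweight Profiling is currently enabled, using a NULL return from
+ * __slwpcb() as the "LWP not active" check. The data values passed to
+ * __lwpins32 are arbitrary example payloads; build with the lwp feature
+ * enabled (e.g. -mlwp).
+ *
+ *   static void __example_record_event(void) {
+ *     if (__slwpcb() != 0)
+ *       (void)__lwpins32(0x1234u, 0x5678u, 0);
+ *   }
+ */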
+
+/// Decrements the LWP programmed value sample event counter. If the result is
+/// negative, inserts an event record into the LWP event ring buffer in memory
+/// and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
+///
+/// \param DATA2
+/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
+/// \param DATA1
+/// A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+/// A 32-bit immediate value is inserted into the 32-bit Flags field.
+#define __lwpval32(DATA2, DATA1, FLAGS) \
+ (__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \
+ (unsigned int) (FLAGS)))
+
+#ifdef __x86_64__
+
+/// Inserts a programmed event record into the LWP event ring buffer
+/// and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
+///
+/// \param DATA2
+/// A 64-bit value is inserted into the 64-bit Data2 field.
+/// \param DATA1
+/// A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+/// A 32-bit immediate value is inserted into the 32-bit Flags field.
+/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
+/// the event record overwrites the last record in the buffer, the MissedEvents
+/// counter in the LWPCB is incremented, the head pointer is not advanced, and
+/// 1 is returned. Otherwise 0 is returned.
+#define __lwpins64(DATA2, DATA1, FLAGS) \
+ (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
+ (unsigned int) (FLAGS)))
+
+/// Decrements the LWP programmed value sample event counter. If the result is
+/// negative, inserts an event record into the LWP event ring buffer in memory
+/// and advances the ring buffer pointer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
+///
+/// \param DATA2
+/// A 64-bit value is inserted into the 64-bit Data2 field.
+/// \param DATA1
+/// A 32-bit value is inserted into the 32-bit Data1 field.
+/// \param FLAGS
+/// A 32-bit immediate value is inserted into the 32-bit Flags field.
+#define __lwpval64(DATA2, DATA1, FLAGS) \
+ (__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
+ (unsigned int) (FLAGS)))
+
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __LWPINTRIN_H */
--- /dev/null
+/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __LZCNTINTRIN_H
+#define __LZCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
+
+#ifndef _MSC_VER
+/// Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+/// An unsigned 16-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 16-bit integer containing the number of leading zero
+/// bits in the operand.
+#define __lzcnt16(X) __builtin_ia32_lzcnt_u16((unsigned short)(X))
+#endif // _MSC_VER
+
+/// Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+/// An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+/// bits in the operand.
+/// \see _lzcnt_u32
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__lzcnt32(unsigned int __X)
+{
+ return __builtin_ia32_lzcnt_u32(__X);
+}
+
+/// Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+/// An unsigned 32-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 32-bit integer containing the number of leading zero
+/// bits in the operand.
+/// \see __lzcnt32
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_lzcnt_u32(unsigned int __X)
+{
+ return __builtin_ia32_lzcnt_u32(__X);
+}
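+
+/* Illustrative sketch (not part of the upstream header): computing
+ * floor(log2(x)) for a nonzero 32-bit value with LZCNT. Note that
+ * _lzcnt_u32(0) returns 32 (the operand width), so the zero case must be
+ * handled separately by the caller. The function name is invented for the
+ * example.
+ *
+ *   static unsigned int __example_floor_log2(unsigned int x) {
+ *     return 31u - _lzcnt_u32(x); // valid only for x != 0
+ *   }
+ */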
+
+#ifdef __x86_64__
+#ifndef _MSC_VER
+/// Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+/// bits in the operand.
+/// \see _lzcnt_u64
+#define __lzcnt64(X) __builtin_ia32_lzcnt_u64((unsigned long long)(X))
+#endif // _MSC_VER
+
+/// Counts the number of leading zero bits in the operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LZCNT instruction.
+///
+/// \param __X
+/// An unsigned 64-bit integer whose leading zeros are to be counted.
+/// \returns An unsigned 64-bit integer containing the number of leading zero
+/// bits in the operand.
+/// \see __lzcnt64
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+_lzcnt_u64(unsigned long long __X)
+{
+ return __builtin_ia32_lzcnt_u64(__X);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __LZCNTINTRIN_H */
--- /dev/null
+/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+typedef float __v2sf __attribute__((__vector_size__(8)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow"), __min_vector_width__(64)))
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
+_m_femms(void) {
+ __builtin_ia32_femms();
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pavgusb(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2id(__m64 __m) {
+ return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfacc(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfadd(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpeq(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpge(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfcmpgt(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmax(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmin(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfmul(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcp(__m64 __m) {
+ return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit1(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrcpit2(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrt(__m64 __m) {
+ return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsub(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfsubr(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fd(__m64 __m) {
+ return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pmulhrw(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/* Handle the 3dnowa instructions here. */
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa"), __min_vector_width__(64)))
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pf2iw(__m64 __m) {
+ return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfnacc(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pfpnacc(__m64 __m1, __m64 __m2) {
+ return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pi2fw(__m64 __m) {
+ return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsf(__m64 __m) {
+ return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
+}
+
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_m_pswapdsi(__m64 __m) {
+ return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MM_MALLOC_H
+#define __MM_MALLOC_H
+
+#include <stdlib.h>
+
+#ifdef _WIN32
+#include <malloc.h>
+#else
+#ifndef __cplusplus
+extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#else
+// Some systems (e.g. those with GNU libc) declare posix_memalign with an
+// exception specifier. Via an "egregious workaround" in
+// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
+// redeclaration of glibc's declaration.
+extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
+#endif
+#endif
+
+#if !(defined(_WIN32) && defined(_mm_malloc))
+static __inline__ void *__attribute__((__always_inline__, __nodebug__,
+ __malloc__))
+_mm_malloc(size_t __size, size_t __align)
+{
+ if (__align == 1) {
+ return malloc(__size);
+ }
+
+ if (!(__align & (__align - 1)) && __align < sizeof(void *))
+ __align = sizeof(void *);
+
+ void *__mallocedMemory;
+#if defined(__MINGW32__)
+ __mallocedMemory = __mingw_aligned_malloc(__size, __align);
+#elif defined(_WIN32)
+ __mallocedMemory = _aligned_malloc(__size, __align);
+#else
+ if (posix_memalign(&__mallocedMemory, __align, __size))
+ return 0;
+#endif
+
+ return __mallocedMemory;
+}
+
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_mm_free(void *__p)
+{
+#if defined(__MINGW32__)
+ __mingw_aligned_free(__p);
+#elif defined(_WIN32)
+ _aligned_free(__p);
+#else
+ free(__p);
+#endif
+}
+#endif
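+
+/* Illustrative sketch (not part of the upstream header): allocate a 32-byte
+ * aligned buffer suitable for AVX loads and stores, then release it. The
+ * buffer size and alignment are example values; _mm_malloc may return NULL
+ * on failure, just like malloc.
+ *
+ *   float *buf = (float *)_mm_malloc(1024 * sizeof(float), 32);
+ *   if (buf) {
+ *     // ... use the 32-byte aligned buffer ...
+ *     _mm_free(buf); // must be paired with _mm_malloc, not free()
+ *   }
+ */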
+
+#endif /* __MM_MALLOC_H */
--- /dev/null
+/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __MMINTRIN_H
+#define __MMINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));
+
+typedef long long __v1di __attribute__((__vector_size__(8)));
+typedef int __v2si __attribute__((__vector_size__(8)));
+typedef short __v4hi __attribute__((__vector_size__(8)));
+typedef char __v8qi __attribute__((__vector_size__(8)));
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))
+
+/// Clears the MMX state by setting the state of the x87 stack registers
+/// to empty.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> EMMS </c> instruction.
+///
+static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
+_mm_empty(void)
+{
+ __builtin_ia32_emms();
+}
+
+/// Constructs a 64-bit integer vector, setting the lower 32 bits to the
+/// value of the 32-bit integer parameter and setting the upper 32 bits to 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVD </c> instruction.
+///
+/// \param __i
+/// A 32-bit integer value.
+/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
+/// parameter. The upper 32 bits are set to 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtsi32_si64(int __i)
+{
+ return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
+}
+
+/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
+/// signed integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVD </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector.
+/// \returns A 32-bit signed integer value containing the lower 32 bits of the
+/// parameter.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtsi64_si32(__m64 __m)
+{
+ return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
+}
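+
+/* Illustrative sketch (not part of the upstream header): round-tripping a
+ * 32-bit value through an __m64 with the two conversions above. The upper
+ * 32 bits are zeroed on the way in and ignored on the way out; the function
+ * name is invented for the example.
+ *
+ *   static int __example_roundtrip(int v) {
+ *     __m64 m = _mm_cvtsi32_si64(v); // lower 32 bits = v, upper 32 bits = 0
+ *     int r = _mm_cvtsi64_si32(m);   // r == v
+ *     _mm_empty();                   // clear MMX state before x87 FP code
+ *     return r;
+ *   }
+ */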
+
+/// Casts a 64-bit signed integer value into a 64-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
+///
+/// \param __i
+/// A 64-bit signed integer.
+/// \returns A 64-bit integer vector containing the same bitwise pattern as the
+/// parameter.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cvtsi64_m64(long long __i)
+{
+ return (__m64)__i;
+}
+
+/// Casts a 64-bit integer vector into a 64-bit signed integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector.
+/// \returns A 64-bit signed integer containing the same bitwise pattern as the
+/// parameter.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtm64_si64(__m64 __m)
+{
+ return (long long)__m;
+}
+
+/// Converts 16-bit signed integers from both 64-bit integer vector
+/// parameters of [4 x i16] into 8-bit signed integer values, and constructs
+/// a 64-bit integer vector of [8 x i8] as the result. Positive values
+/// greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
+/// are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+/// 16-bit signed integer and is converted to an 8-bit signed integer with
+/// saturation. Positive values greater than 0x7F are saturated to 0x7F.
+/// Negative values less than 0x80 are saturated to 0x80. The converted
+/// [4 x i8] values are written to the lower 32 bits of the result.
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+/// 16-bit signed integer and is converted to an 8-bit signed integer with
+/// saturation. Positive values greater than 0x7F are saturated to 0x7F.
+/// Negative values less than 0x80 are saturated to 0x80. The converted
+/// [4 x i8] values are written to the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the converted
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pi16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
+}
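+
+/* Illustrative sketch (not part of the upstream header): signed saturation
+ * when packing 16-bit lanes to 8-bit lanes. _mm_set_pi16() is defined later
+ * in this header and takes its arguments from the highest lane to the
+ * lowest; the function name is invented for the example.
+ *
+ *   static __m64 __example_pack(void) {
+ *     __m64 a = _mm_set_pi16(300, -200, 127, -128);
+ *     __m64 b = _mm_set_pi16(0, 1, -1, 42);
+ *     __m64 r = _mm_packs_pi16(a, b);
+ *     // r bytes, lowest to highest: -128, 127, -128, 127 (a's lanes, with
+ *     // -200 and 300 saturated), then 42, -1, 1, 0 (b's lanes).
+ *     return r;
+ *   }
+ */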
+
+/// Converts 32-bit signed integers from both 64-bit integer vector
+/// parameters of [2 x i32] into 16-bit signed integer values, and constructs
+/// a 64-bit integer vector of [4 x i16] as the result. Positive values
+/// greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
+/// 0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
+/// 32-bit signed integer and is converted to a 16-bit signed integer with
+/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+/// Negative values less than 0x8000 are saturated to 0x8000. The converted
+/// [2 x i16] values are written to the lower 32 bits of the result.
+/// \param __m2
+/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
+/// 32-bit signed integer and is converted to a 16-bit signed integer with
+/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+/// Negative values less than 0x8000 are saturated to 0x8000. The converted
+/// [2 x i16] values are written to the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [4 x i16] containing the converted
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pi32(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
+}
+
+/// Converts 16-bit signed integers from both 64-bit integer vector
+/// parameters of [4 x i16] into 8-bit unsigned integer values, and
+/// constructs a 64-bit integer vector of [8 x i8] as the result. Values
+/// greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
+/// to 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+/// 16-bit signed integer and is converted to an 8-bit unsigned integer with
+/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+/// than 0 are saturated to 0. The converted [4 x i8] values are written to
+/// the lower 32 bits of the result.
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
+/// 16-bit signed integer and is converted to an 8-bit unsigned integer with
+/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+/// than 0 are saturated to 0. The converted [4 x i8] values are written to
+/// the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the converted
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_packs_pu16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
+/// and interleaves them into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [8 x i8]. \n
+/// Bits [39:32] are written to bits [7:0] of the result. \n
+/// Bits [47:40] are written to bits [23:16] of the result. \n
+/// Bits [55:48] are written to bits [39:32] of the result. \n
+/// Bits [63:56] are written to bits [55:48] of the result.
+/// \param __m2
+/// A 64-bit integer vector of [8 x i8].
+/// Bits [39:32] are written to bits [15:8] of the result. \n
+/// Bits [47:40] are written to bits [31:24] of the result. \n
+/// Bits [55:48] are written to bits [47:40] of the result. \n
+/// Bits [63:56] are written to bits [63:56] of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
+}
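+
+/* Illustrative sketch (not part of the upstream header): interleaving the
+ * upper halves of two byte vectors. _mm_set_pi8() is defined later in this
+ * header and takes its arguments from the highest byte to the lowest; the
+ * function name is invented for the example.
+ *
+ *   static __m64 __example_unpackhi(void) {
+ *     __m64 a = _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);        // bytes 0..7
+ *     __m64 b = _mm_set_pi8(17, 16, 15, 14, 13, 12, 11, 10); // bytes 10..17
+ *     // Result bytes, lowest to highest: 4, 14, 5, 15, 6, 16, 7, 17.
+ *     return _mm_unpackhi_pi8(a, b);
+ *   }
+ */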
+
+/// Unpacks the upper 32 bits from two 64-bit integer vectors of
+/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16].
+/// Bits [47:32] are written to bits [15:0] of the result. \n
+/// Bits [63:48] are written to bits [47:32] of the result.
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16].
+/// Bits [47:32] are written to bits [31:16] of the result. \n
+/// Bits [63:48] are written to bits [63:48] of the result.
+/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Unpacks the upper 32 bits from two 64-bit integer vectors of
+/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
+/// the lower 32 bits of the result.
+/// \param __m2
+/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
+/// the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
+}
+
+/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
+/// and interleaves them into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [8 x i8].
+/// Bits [7:0] are written to bits [7:0] of the result. \n
+/// Bits [15:8] are written to bits [23:16] of the result. \n
+/// Bits [23:16] are written to bits [39:32] of the result. \n
+/// Bits [31:24] are written to bits [55:48] of the result.
+/// \param __m2
+/// A 64-bit integer vector of [8 x i8].
+/// Bits [7:0] are written to bits [15:8] of the result. \n
+/// Bits [15:8] are written to bits [31:24] of the result. \n
+/// Bits [23:16] are written to bits [47:40] of the result. \n
+/// Bits [31:24] are written to bits [63:56] of the result.
+/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// Unpacks the lower 32 bits from two 64-bit integer vectors of
+/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16].
+/// Bits [15:0] are written to bits [15:0] of the result. \n
+/// Bits [31:16] are written to bits [47:32] of the result.
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16].
+/// Bits [15:0] are written to bits [31:16] of the result. \n
+/// Bits [31:16] are written to bits [63:48] of the result.
+/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Unpacks the lower 32 bits from two 64-bit integer vectors of
+/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
+/// the lower 32 bits of the result.
+/// \param __m2
+/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
+/// the upper 32 bits of the result.
+/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
+}
+
+/// Adds each 8-bit integer element of the first 64-bit integer vector
+/// of [8 x i8] to the corresponding 8-bit integer element of the second
+/// 64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
+/// packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDB </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [8 x i8].
+/// \param __m2
+/// A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
+/// parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi8(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// Adds each 16-bit integer element of the first 64-bit integer vector
+/// of [4 x i16] to the corresponding 16-bit integer element of the second
+/// 64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
+/// packed into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16].
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
+/// parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Adds each 32-bit integer element of the first 64-bit integer vector
+/// of [2 x i32] to the corresponding 32-bit integer element of the second
+/// 64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
+/// packed into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDD </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [2 x i32].
+/// \param __m2
+/// A 64-bit integer vector of [2 x i32].
+/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
+/// parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_add_pi32(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
+}
+
+/// Adds each 8-bit signed integer element of the first 64-bit integer
+/// vector of [8 x i8] to the corresponding 8-bit signed integer element of
+/// the second 64-bit integer vector of [8 x i8]. Positive sums greater than
+/// 0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
+/// 0x80. The results are packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [8 x i8].
+/// \param __m2
+/// A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
+/// of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pi8(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// Adds each 16-bit signed integer element of the first 64-bit integer
+/// vector of [4 x i16] to the corresponding 16-bit signed integer element of
+/// the second 64-bit integer vector of [4 x i16]. Positive sums greater than
+/// 0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
+/// saturated to 0x8000. The results are packed into a 64-bit integer vector
+/// of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16].
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
+/// of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pi16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Adds each 8-bit unsigned integer element of the first 64-bit integer
+/// vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
+/// the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
+/// saturated to 0xFF. The results are packed into a 64-bit integer vector of
+/// [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [8 x i8].
+/// \param __m2
+/// A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
+/// unsigned sums of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pu8(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
+}
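+
+/* Illustrative sketch (not part of the upstream header): unsigned saturating
+ * addition on bytes, e.g. brightening 8-bit pixel data without wrap-around.
+ * _mm_set1_pi8() is defined later in this header; the function name is
+ * invented for the example, and callers should still issue _mm_empty()
+ * before returning to x87 floating-point code.
+ *
+ *   static __m64 __example_brighten(__m64 pixels) {
+ *     // Each byte is increased by 64; sums above 255 saturate to 255.
+ *     return _mm_adds_pu8(pixels, _mm_set1_pi8(64));
+ *   }
+ */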
+
+/// Adds each 16-bit unsigned integer element of the first 64-bit integer
+/// vector of [4 x i16] to the corresponding 16-bit unsigned integer element
+/// of the second 64-bit integer vector of [4 x i16]. Sums greater than
+/// 0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
+/// integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16].
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
+/// unsigned sums of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_adds_pu16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Subtracts each 8-bit integer element of the second 64-bit integer
+/// vector of [8 x i8] from the corresponding 8-bit integer element of the
+/// first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
+/// are packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [8 x i8] containing the minuends.
+/// \param __m2
+/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
+/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
+/// both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi8(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// Subtracts each 16-bit integer element of the second 64-bit integer
+/// vector of [4 x i16] from the corresponding 16-bit integer element of the
+/// first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
+/// results are packed into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16] containing the minuends.
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
+/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
+/// both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Subtracts each 32-bit integer element of the second 64-bit integer
+/// vector of [2 x i32] from the corresponding 32-bit integer element of the
+/// first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
+/// results are packed into a 64-bit integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [2 x i32] containing the minuends.
+/// \param __m2
+/// A 64-bit integer vector of [2 x i32] containing the subtrahends.
+/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
+/// both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sub_pi32(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
+}
+
+/// Subtracts each 8-bit signed integer element of the second 64-bit
+/// integer vector of [8 x i8] from the corresponding 8-bit signed integer
+/// element of the first 64-bit integer vector of [8 x i8]. Positive results
+/// greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
+/// are saturated to 0x80. The results are packed into a 64-bit integer
+/// vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [8 x i8] containing the minuends.
+/// \param __m2
+/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
+/// differences of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pi8(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// Subtracts each 16-bit signed integer element of the second 64-bit
+/// integer vector of [4 x i16] from the corresponding 16-bit signed integer
+/// element of the first 64-bit integer vector of [4 x i16]. Positive results
+/// greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
+/// 0x8000 are saturated to 0x8000. The results are packed into a 64-bit
+/// integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16] containing the minuends.
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
+/// differences of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pi16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Subtracts each 8-bit unsigned integer element of the second 64-bit
+/// integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
+/// element of the first 64-bit integer vector of [8 x i8].
+///
+/// If an element of the first vector is less than the corresponding element
+/// of the second vector, the result is saturated to 0. The results are
+/// packed into a 64-bit integer vector of [8 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [8 x i8] containing the minuends.
+/// \param __m2
+/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
+/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
+/// differences of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pu8(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// Subtracts each 16-bit unsigned integer element of the second 64-bit
+/// integer vector of [4 x i16] from the corresponding 16-bit unsigned
+/// integer element of the first 64-bit integer vector of [4 x i16].
+///
+/// If an element of the first vector is less than the corresponding element
+/// of the second vector, the result is saturated to 0. The results are
+/// packed into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16] containing the minuends.
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
+/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
+/// differences of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_subs_pu16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Multiplies each 16-bit signed integer element of the first 64-bit
+/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
+/// element of the second 64-bit integer vector of [4 x i16] to produce four
+/// 32-bit products. Adds adjacent pairs of products to produce two 32-bit sums.
+/// The lower 32 bits of these two sums are packed into a 64-bit integer
+/// vector of [2 x i32].
+///
+/// For example, bits [15:0] of both parameters are multiplied, bits [31:16]
+/// of both parameters are multiplied, and the sum of both results is written
+/// to bits [31:0] of the result.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16].
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
+/// products of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_madd_pi16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Multiplies each 16-bit signed integer element of the first 64-bit
+/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
+/// element of the second 64-bit integer vector of [4 x i16]. Packs the upper
+/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16].
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
+/// of the products of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Multiplies each 16-bit signed integer element of the first 64-bit
+/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
+/// element of the second 64-bit integer vector of [4 x i16]. Packs the lower
+/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16].
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
+/// of the products of both parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_mullo_pi16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Left-shifts each 16-bit signed integer element of the first
+/// parameter, which is a 64-bit integer vector of [4 x i16], by the number
+/// of bits specified by the second parameter, which is a 64-bit integer. The
+/// lower 16 bits of the results are packed into a 64-bit integer vector of
+/// [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector of [4 x i16].
+/// \param __count
+/// A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
+/// values. If \a __count is greater than or equal to 16, the result is set to
+/// all 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_pi16(__m64 __m, __m64 __count)
+{
+ return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
+}
+
+/// Left-shifts each 16-bit signed integer element of a 64-bit integer
+/// vector of [4 x i16] by the number of bits specified by a 32-bit integer.
+/// The lower 16 bits of the results are packed into a 64-bit integer vector
+/// of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector of [4 x i16].
+/// \param __count
+/// A 32-bit integer value.
+/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
+/// values. If \a __count is greater than or equal to 16, the result is set to
+/// all 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_pi16(__m64 __m, int __count)
+{
+ return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
+}
+
+/// Left-shifts each 32-bit signed integer element of the first
+/// parameter, which is a 64-bit integer vector of [2 x i32], by the number
+/// of bits specified by the second parameter, which is a 64-bit integer. The
+/// lower 32 bits of the results are packed into a 64-bit integer vector of
+/// [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector of [2 x i32].
+/// \param __count
+/// A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
+/// values. If \a __count is greater than or equal to 32, the result is set to
+/// all 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_pi32(__m64 __m, __m64 __count)
+{
+ return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
+}
+
+/// Left-shifts each 32-bit signed integer element of a 64-bit integer
+/// vector of [2 x i32] by the number of bits specified by a 32-bit integer.
+/// The lower 32 bits of the results are packed into a 64-bit integer vector
+/// of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector of [2 x i32].
+/// \param __count
+/// A 32-bit integer value.
+/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
+/// values. If \a __count is greater than or equal to 32, the result is set to
+/// all 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_pi32(__m64 __m, int __count)
+{
+ return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
+}
+
+/// Left-shifts the first 64-bit integer parameter by the number of bits
+/// specified by the second 64-bit integer parameter. The lower 64 bits of
+/// the result are returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+/// A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector containing the left-shifted value. If
+/// \a __count is greater than or equal to 64, the result is set to 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sll_si64(__m64 __m, __m64 __count)
+{
+ return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
+}
+
+/// Left-shifts the first parameter, which is a 64-bit integer, by the
+/// number of bits specified by the second parameter, which is a 32-bit
+/// integer. The lower 64 bits of the result are returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+/// A 32-bit integer value.
+/// \returns A 64-bit integer vector containing the left-shifted value. If
+/// \a __count is greater than or equal to 64, the result is set to 0.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_slli_si64(__m64 __m, int __count)
+{
+ return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
+}
+
+/// Right-shifts each 16-bit integer element of the first parameter,
+/// which is a 64-bit integer vector of [4 x i16], by the number of bits
+/// specified by the second parameter, which is a 64-bit integer.
+///
+/// High-order bits are filled with the sign bit of the initial value of each
+/// 16-bit element. The 16-bit results are packed into a 64-bit integer
+/// vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector of [4 x i16].
+/// \param __count
+/// A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sra_pi16(__m64 __m, __m64 __count)
+{
+ return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
+}
+
+/// Right-shifts each 16-bit integer element of a 64-bit integer vector
+/// of [4 x i16] by the number of bits specified by a 32-bit integer.
+///
+/// High-order bits are filled with the sign bit of the initial value of each
+/// 16-bit element. The 16-bit results are packed into a 64-bit integer
+/// vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector of [4 x i16].
+/// \param __count
+/// A 32-bit integer value.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srai_pi16(__m64 __m, int __count)
+{
+ return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
+}
+
+/// Right-shifts each 32-bit integer element of the first parameter,
+/// which is a 64-bit integer vector of [2 x i32], by the number of bits
+/// specified by the second parameter, which is a 64-bit integer.
+///
+/// High-order bits are filled with the sign bit of the initial value of each
+/// 32-bit element. The 32-bit results are packed into a 64-bit integer
+/// vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector of [2 x i32].
+/// \param __count
+/// A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_sra_pi32(__m64 __m, __m64 __count)
+{
+ return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
+}
+
+/// Right-shifts each 32-bit integer element of a 64-bit integer vector
+/// of [2 x i32] by the number of bits specified by a 32-bit integer.
+///
+/// High-order bits are filled with the sign bit of the initial value of each
+/// 32-bit element. The 32-bit results are packed into a 64-bit integer
+/// vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector of [2 x i32].
+/// \param __count
+/// A 32-bit integer value.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srai_pi32(__m64 __m, int __count)
+{
+ return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
+}
+
+/// Right-shifts each 16-bit integer element of the first parameter,
+/// which is a 64-bit integer vector of [4 x i16], by the number of bits
+/// specified by the second parameter, which is a 64-bit integer.
+///
+/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
+/// integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector of [4 x i16].
+/// \param __count
+/// A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_pi16(__m64 __m, __m64 __count)
+{
+ return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
+}
+
+/// Right-shifts each 16-bit integer element of a 64-bit integer vector
+/// of [4 x i16] by the number of bits specified by a 32-bit integer.
+///
+/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
+/// integer vector of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector of [4 x i16].
+/// \param __count
+/// A 32-bit integer value.
+/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_pi16(__m64 __m, int __count)
+{
+ return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
+}
+
+/// Right-shifts each 32-bit integer element of the first parameter,
+/// which is a 64-bit integer vector of [2 x i32], by the number of bits
+/// specified by the second parameter, which is a 64-bit integer.
+///
+/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
+/// integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector of [2 x i32].
+/// \param __count
+/// A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_pi32(__m64 __m, __m64 __count)
+{
+ return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
+}
+
+/// Right-shifts each 32-bit integer element of a 64-bit integer vector
+/// of [2 x i32] by the number of bits specified by a 32-bit integer.
+///
+/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
+/// integer vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector of [2 x i32].
+/// \param __count
+/// A 32-bit integer value.
+/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_pi32(__m64 __m, int __count)
+{
+ return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
+}
+
+/// Right-shifts the first 64-bit integer parameter by the number of bits
+/// specified by the second 64-bit integer parameter.
+///
+/// High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+/// A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \returns A 64-bit integer vector containing the right-shifted value.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srl_si64(__m64 __m, __m64 __count)
+{
+ return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
+}
+
+/// Right-shifts the first parameter, which is a 64-bit integer, by the
+/// number of bits specified by the second parameter, which is a 32-bit
+/// integer.
+///
+/// High-order bits are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
+///
+/// \param __m
+/// A 64-bit integer vector interpreted as a single 64-bit integer.
+/// \param __count
+/// A 32-bit integer value.
+/// \returns A 64-bit integer vector containing the right-shifted value.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_srli_si64(__m64 __m, int __count)
+{
+ return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
+}
+
+/// Performs a bitwise AND of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAND </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector.
+/// \param __m2
+/// A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise AND of both
+/// parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_and_si64(__m64 __m1, __m64 __m2)
+{
+ return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
+}
+
+/// Performs a bitwise NOT of the first 64-bit integer vector, and then
+/// performs a bitwise AND of the intermediate result and the second 64-bit
+/// integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PANDN </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector. The one's complement of this parameter is used
+/// in the bitwise AND.
+/// \param __m2
+/// A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise AND of the second
+/// parameter and the one's complement of the first parameter.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_andnot_si64(__m64 __m1, __m64 __m2)
+{
+ return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
+}
+
+/// Performs a bitwise OR of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POR </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector.
+/// \param __m2
+/// A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise OR of both
+/// parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_or_si64(__m64 __m1, __m64 __m2)
+{
+ return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
+}
+
+/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PXOR </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector.
+/// \param __m2
+/// A 64-bit integer vector.
+/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
+/// parameters.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_xor_si64(__m64 __m1, __m64 __m2)
+{
+ return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
+}
+
+/// Compares the 8-bit integer elements of two 64-bit integer vectors of
+/// [8 x i8] to determine if the element of the first vector is equal to the
+/// corresponding element of the second vector.
+///
+/// The comparison yields 0 for false, 0xFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [8 x i8].
+/// \param __m2
+/// A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
+/// results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// Compares the 16-bit integer elements of two 64-bit integer vectors of
+/// [4 x i16] to determine if the element of the first vector is equal to the
+/// corresponding element of the second vector.
+///
+/// The comparison yields 0 for false, 0xFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16].
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
+/// results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Compares the 32-bit integer elements of two 64-bit integer vectors of
+/// [2 x i32] to determine if the element of the first vector is equal to the
+/// corresponding element of the second vector.
+///
+/// The comparison yields 0 for false, 0xFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [2 x i32].
+/// \param __m2
+/// A 64-bit integer vector of [2 x i32].
+/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
+/// results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
+}
+
+/// Compares the 8-bit integer elements of two 64-bit integer vectors of
+/// [8 x i8] to determine if the element of the first vector is greater than
+/// the corresponding element of the second vector.
+///
+/// The comparison yields 0 for false, 0xFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [8 x i8].
+/// \param __m2
+/// A 64-bit integer vector of [8 x i8].
+/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
+/// results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
+}
+
+/// Compares the 16-bit integer elements of two 64-bit integer vectors of
+/// [4 x i16] to determine if the element of the first vector is greater than
+/// the corresponding element of the second vector.
+///
+/// The comparison yields 0 for false, 0xFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [4 x i16].
+/// \param __m2
+/// A 64-bit integer vector of [4 x i16].
+/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
+/// results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
+}
+
+/// Compares the 32-bit integer elements of two 64-bit integer vectors of
+/// [2 x i32] to determine if the element of the first vector is greater than
+/// the corresponding element of the second vector.
+///
+/// The comparison yields 0 for false, 0xFFFFFFFF for true.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
+///
+/// \param __m1
+/// A 64-bit integer vector of [2 x i32].
+/// \param __m2
+/// A 64-bit integer vector of [2 x i32].
+/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
+/// results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
+{
+ return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
+}
+
+/// Constructs a 64-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PXOR </c> instruction.
+///
+/// \returns An initialized 64-bit integer vector with all elements set to zero.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setzero_si64(void)
+{
+ return __extension__ (__m64){ 0LL };
+}
+
+/// Constructs a 64-bit integer vector initialized with the specified
+/// 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i1
+/// A 32-bit integer value used to initialize the upper 32 bits of the
+/// result.
+/// \param __i0
+/// A 32-bit integer value used to initialize the lower 32 bits of the
+/// result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi32(int __i1, int __i0)
+{
+ return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
+}
+
+/// Constructs a 64-bit integer vector initialized with the specified
+/// 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __s3
+/// A 16-bit integer value used to initialize bits [63:48] of the result.
+/// \param __s2
+/// A 16-bit integer value used to initialize bits [47:32] of the result.
+/// \param __s1
+/// A 16-bit integer value used to initialize bits [31:16] of the result.
+/// \param __s0
+/// A 16-bit integer value used to initialize bits [15:0] of the result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
+{
+ return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
+}
+
+/// Constructs a 64-bit integer vector initialized with the specified
+/// 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b7
+/// An 8-bit integer value used to initialize bits [63:56] of the result.
+/// \param __b6
+/// An 8-bit integer value used to initialize bits [55:48] of the result.
+/// \param __b5
+/// An 8-bit integer value used to initialize bits [47:40] of the result.
+/// \param __b4
+/// An 8-bit integer value used to initialize bits [39:32] of the result.
+/// \param __b3
+/// An 8-bit integer value used to initialize bits [31:24] of the result.
+/// \param __b2
+/// An 8-bit integer value used to initialize bits [23:16] of the result.
+/// \param __b1
+/// An 8-bit integer value used to initialize bits [15:8] of the result.
+/// \param __b0
+/// An 8-bit integer value used to initialize bits [7:0] of the result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
+ char __b1, char __b0)
+{
+ return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
+ __b4, __b5, __b6, __b7);
+}
+
+/// Constructs a 64-bit integer vector of [2 x i32], with each of the
+/// 32-bit integer vector elements set to the specified 32-bit integer
+/// value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i
+/// A 32-bit integer value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 64-bit integer vector of [2 x i32].
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi32(int __i)
+{
+ return _mm_set_pi32(__i, __i);
+}
+
+/// Constructs a 64-bit integer vector of [4 x i16], with each of the
+/// 16-bit integer vector elements set to the specified 16-bit integer
+/// value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w
+/// A 16-bit integer value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 64-bit integer vector of [4 x i16].
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi16(short __w)
+{
+ return _mm_set_pi16(__w, __w, __w, __w);
+}
+
+/// Constructs a 64-bit integer vector of [8 x i8], with each of the
+/// 8-bit integer vector elements set to the specified 8-bit integer value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b
+/// An 8-bit integer value used to initialize each vector element of the
+/// result.
+/// \returns An initialized 64-bit integer vector of [8 x i8].
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_set1_pi8(char __b)
+{
+ return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
+}
+
+/// Constructs a 64-bit integer vector, initialized in reverse order with
+/// the specified 32-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __i0
+/// A 32-bit integer value used to initialize the lower 32 bits of the
+/// result.
+/// \param __i1
+/// A 32-bit integer value used to initialize the upper 32 bits of the
+/// result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi32(int __i0, int __i1)
+{
+ return _mm_set_pi32(__i1, __i0);
+}
+
+/// Constructs a 64-bit integer vector, initialized in reverse order with
+/// the specified 16-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __w0
+/// A 16-bit integer value used to initialize bits [15:0] of the result.
+/// \param __w1
+/// A 16-bit integer value used to initialize bits [31:16] of the result.
+/// \param __w2
+/// A 16-bit integer value used to initialize bits [47:32] of the result.
+/// \param __w3
+/// A 16-bit integer value used to initialize bits [63:48] of the result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
+{
+ return _mm_set_pi16(__w3, __w2, __w1, __w0);
+}
+
+/// Constructs a 64-bit integer vector, initialized in reverse order with
+/// the specified 8-bit integer values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __b0
+/// An 8-bit integer value used to initialize bits [7:0] of the result.
+/// \param __b1
+/// An 8-bit integer value used to initialize bits [15:8] of the result.
+/// \param __b2
+/// An 8-bit integer value used to initialize bits [23:16] of the result.
+/// \param __b3
+/// An 8-bit integer value used to initialize bits [31:24] of the result.
+/// \param __b4
+/// An 8-bit integer value used to initialize bits [39:32] of the result.
+/// \param __b5
+/// An 8-bit integer value used to initialize bits [47:40] of the result.
+/// \param __b6
+/// An 8-bit integer value used to initialize bits [55:48] of the result.
+/// \param __b7
+/// An 8-bit integer value used to initialize bits [63:56] of the result.
+/// \returns An initialized 64-bit integer vector.
+static __inline__ __m64 __DEFAULT_FN_ATTRS
+_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
+ char __b6, char __b7)
+{
+ return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+/* Aliases for compatibility. */
+#define _m_empty _mm_empty
+#define _m_from_int _mm_cvtsi32_si64
+#define _m_from_int64 _mm_cvtsi64_m64
+#define _m_to_int _mm_cvtsi64_si32
+#define _m_to_int64 _mm_cvtm64_si64
+#define _m_packsswb _mm_packs_pi16
+#define _m_packssdw _mm_packs_pi32
+#define _m_packuswb _mm_packs_pu16
+#define _m_punpckhbw _mm_unpackhi_pi8
+#define _m_punpckhwd _mm_unpackhi_pi16
+#define _m_punpckhdq _mm_unpackhi_pi32
+#define _m_punpcklbw _mm_unpacklo_pi8
+#define _m_punpcklwd _mm_unpacklo_pi16
+#define _m_punpckldq _mm_unpacklo_pi32
+#define _m_paddb _mm_add_pi8
+#define _m_paddw _mm_add_pi16
+#define _m_paddd _mm_add_pi32
+#define _m_paddsb _mm_adds_pi8
+#define _m_paddsw _mm_adds_pi16
+#define _m_paddusb _mm_adds_pu8
+#define _m_paddusw _mm_adds_pu16
+#define _m_psubb _mm_sub_pi8
+#define _m_psubw _mm_sub_pi16
+#define _m_psubd _mm_sub_pi32
+#define _m_psubsb _mm_subs_pi8
+#define _m_psubsw _mm_subs_pi16
+#define _m_psubusb _mm_subs_pu8
+#define _m_psubusw _mm_subs_pu16
+#define _m_pmaddwd _mm_madd_pi16
+#define _m_pmulhw _mm_mulhi_pi16
+#define _m_pmullw _mm_mullo_pi16
+#define _m_psllw _mm_sll_pi16
+#define _m_psllwi _mm_slli_pi16
+#define _m_pslld _mm_sll_pi32
+#define _m_pslldi _mm_slli_pi32
+#define _m_psllq _mm_sll_si64
+#define _m_psllqi _mm_slli_si64
+#define _m_psraw _mm_sra_pi16
+#define _m_psrawi _mm_srai_pi16
+#define _m_psrad _mm_sra_pi32
+#define _m_psradi _mm_srai_pi32
+#define _m_psrlw _mm_srl_pi16
+#define _m_psrlwi _mm_srli_pi16
+#define _m_psrld _mm_srl_pi32
+#define _m_psrldi _mm_srli_pi32
+#define _m_psrlq _mm_srl_si64
+#define _m_psrlqi _mm_srli_si64
+#define _m_pand _mm_and_si64
+#define _m_pandn _mm_andnot_si64
+#define _m_por _mm_or_si64
+#define _m_pxor _mm_xor_si64
+#define _m_pcmpeqb _mm_cmpeq_pi8
+#define _m_pcmpeqw _mm_cmpeq_pi16
+#define _m_pcmpeqd _mm_cmpeq_pi32
+#define _m_pcmpgtb _mm_cmpgt_pi8
+#define _m_pcmpgtw _mm_cmpgt_pi16
+#define _m_pcmpgtd _mm_cmpgt_pi32
+
+#endif /* __MMINTRIN_H */
+
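Illustrative usage (not part of the patch): a minimal sketch of the saturating-add and shift intrinsics declared above, assuming a Clang or GCC toolchain invoked with -mmmx.

#include <mmintrin.h>
#include <stdio.h>

int main(void)
{
    /* Lanes are listed high to low: s3, s2, s1, s0. */
    __m64 a = _mm_set_pi16(32000, -32000, 100, -100);
    __m64 b = _mm_set_pi16(1000, -1000, 1, 1);

    /* Signed saturating add: lanes become 32767, -32768, 101, -99. */
    __m64 sum = _mm_adds_pi16(a, b);

    /* Shift every 16-bit lane left by 2 bits. */
    __m64 shifted = _mm_slli_pi16(sum, 2);

    /* Extract the low 32 bits, then leave MMX state before other FP code. */
    unsigned int low = (unsigned int)_mm_cvtsi64_si32(shifted);
    _mm_empty();
    printf("low 32 bits: 0x%08x\n", low);
    return 0;
}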
--- /dev/null
+/*===------------------------- movdirintrin.h ------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <movdirintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _MOVDIRINTRIN_H
+#define _MOVDIRINTRIN_H
+
+/* Move doubleword as direct store */
+static __inline__ void
+__attribute__((__always_inline__, __nodebug__, __target__("movdiri")))
+_directstoreu_u32 (void *__dst, unsigned int __value)
+{
+ __builtin_ia32_directstore_u32((unsigned int *)__dst, (unsigned int)__value);
+}
+
+#ifdef __x86_64__
+
+/* Move quadword as direct store */
+static __inline__ void
+__attribute__((__always_inline__, __nodebug__, __target__("movdiri")))
+_directstoreu_u64 (void *__dst, unsigned long __value)
+{
+ __builtin_ia32_directstore_u64((unsigned long *)__dst, __value);
+}
+
+#endif /* __x86_64__ */
+
+/*
+ * movdir64b - Move 64 bytes as direct store.
+ * The destination must be 64-byte aligned, and the store is atomic.
+ * The source address has no alignment requirement, and the load from
+ * the source address is not atomic.
+ */
+static __inline__ void
+__attribute__((__always_inline__, __nodebug__, __target__("movdir64b")))
+_movdir64b (void *__dst __attribute__((align_value(64))), const void *__src)
+{
+ __builtin_ia32_movdir64b(__dst, __src);
+}
+
+#endif /* _MOVDIRINTRIN_H */
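Illustrative usage (not part of the patch): a sketch of a 64-byte direct store, assuming MOVDIRI/MOVDIR64B hardware and -mmovdiri -mmovdir64b; the buffers are placeholders for what would normally be device-visible memory.

#include <immintrin.h>
#include <stdint.h>

/* Placeholder destination; _movdir64b requires 64-byte alignment. */
static _Alignas(64) uint8_t dst[64];
static uint8_t src[64];

void submit_descriptor(void)
{
    /* Copy 64 bytes as a single, atomic direct store. */
    _movdir64b(dst, src);

    /* Follow up with a 4-byte direct store into the same buffer. */
    _directstoreu_u32(dst, 0x1u);
}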
--- /dev/null
+/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __MWAITXINTRIN_H
+#define __MWAITXINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mwaitx")))
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitorx(void * __p, unsigned __extensions, unsigned __hints)
+{
+ __builtin_ia32_monitorx(__p, __extensions, __hints);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
+{
+ __builtin_ia32_mwaitx(__extensions, __hints, __clock);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __MWAITXINTRIN_H */
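Illustrative usage (not part of the patch): a sketch of a MONITORX/MWAITX wait loop, assuming an AMD processor with MWAITX and -mmwaitx; the flag variable is hypothetical.

#include <x86intrin.h>

static volatile int wake_flag;

void wait_for_flag(void)
{
    while (!wake_flag) {
        /* Arm monitoring on the cache line that holds wake_flag. */
        _mm_monitorx((void *)&wake_flag, 0, 0);
        if (wake_flag)
            break;
        /* Sleep until a store to the monitored line or an interrupt occurs. */
        _mm_mwaitx(0, 0, 0);
    }
}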
--- /dev/null
+/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __NMMINTRIN_H
+#define __NMMINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+/* To match the expectations of gcc, the SSE4.2 definitions are kept in
+ smmintrin.h; just include it here. */
+#include <smmintrin.h>
+#endif /* __NMMINTRIN_H */
--- /dev/null
+/*===---- pconfigintrin.h - X86 platform configuration ---------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <pconfigintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __PCONFIGINTRIN_H
+#define __PCONFIGINTRIN_H
+
+#define __PCONFIG_KEY_PROGRAM 0x00000001
+
+#if __has_extension(gnu_asm)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("pconfig")))
+
+static __inline unsigned int __DEFAULT_FN_ATTRS
+_pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
+{
+ unsigned int __result;
+ __asm__ ("pconfig"
+ : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
+ : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
+ : "cc");
+ return __result;
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __has_extension(gnu_asm) */
+
+#endif
--- /dev/null
+/*===---- pkuintrin.h - PKU intrinsics -------------------------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __PKUINTRIN_H
+#define __PKUINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_rdpkru_u32(void)
+{
+ return __builtin_ia32_rdpkru();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_wrpkru(unsigned int __val)
+{
+ __builtin_ia32_wrpkru(__val);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
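Illustrative usage (not part of the patch): a sketch of toggling PKRU bits with the intrinsics above, assuming PKU support and pages already tagged with protection key 1 (for example via Linux pkey_mprotect, which is outside this header).

#include <immintrin.h>

/* PKRU holds two bits per key: bit 2*k disables access, bit 2*k+1 disables
   writes. Make key 1 read-only for the current thread. */
void make_key1_read_only(void)
{
    unsigned int pkru = _rdpkru_u32();
    pkru |= 1u << (2 * 1 + 1);   /* set the write-disable bit for key 1 */
    _wrpkru(pkru);
}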
--- /dev/null
+/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PMMINTRIN_H
+#define __PMMINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+#include <emmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128)))
+
+/// Loads data from an unaligned memory location to elements in a 128-bit
+/// vector.
+///
+/// If the address of the data is not 16-byte aligned, the instruction may
+/// read two adjacent aligned blocks of memory to retrieve the requested
+/// data.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
+///
+/// \param __p
+/// A pointer to a 128-bit integer vector containing integer values.
+/// \returns A 128-bit vector containing the moved values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_lddqu_si128(__m128i const *__p)
+{
+ return (__m128i)__builtin_ia32_lddqu((char const *)__p);
+}
+
+/// Adds the even-indexed values and subtracts the odd-indexed values of
+/// two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the left source operand.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing the right source operand.
+/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
+/// differences of both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_addsub_ps(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Horizontally adds the adjacent pairs of values contained in two
+/// 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// The horizontal sums of the values are stored in the lower bits of the
+/// destination.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// The horizontal sums of the values are stored in the upper bits of the
+/// destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
+/// both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hadd_ps(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in two
+/// 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// The horizontal differences between the values are stored in the lower
+/// bits of the destination.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// The horizontal differences between the values are stored in the upper
+/// bits of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the horizontal
+/// differences of both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_hsub_ps(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Moves and duplicates odd-indexed values from a 128-bit vector
+/// of [4 x float] to float values stored in a 128-bit vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
+/// the destination. \n
+/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
+/// destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
+/// values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehdup_ps(__m128 __a)
+{
+ return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
+}
+
+/// Duplicates even-indexed values from a 128-bit vector of
+/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] \n
+/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
+/// the destination. \n
+/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
+/// destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
+/// values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_moveldup_ps(__m128 __a)
+{
+ return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
+}
+
+/// Adds the even-indexed values and subtracts the odd-indexed values of
+/// two 128-bit vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing the left source operand.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing the right source operand.
+/// \returns A 128-bit vector of [2 x double] containing the alternating sums
+/// and differences of both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_addsub_pd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
+}
+
+/// Horizontally adds the pairs of values contained in two 128-bit
+/// vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// The horizontal sum of the values is stored in the lower bits of the
+/// destination.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// The horizontal sum of the values is stored in the upper bits of the
+/// destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
+/// both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hadd_pd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
+}
+
+/// Horizontally subtracts the pairs of values contained in two 128-bit
+/// vectors of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// The horizontal difference of the values is stored in the lower bits of
+/// the destination.
+/// \param __b
+/// A 128-bit vector of [2 x double] containing one of the source operands.
+/// The horizontal difference of the values is stored in the upper bits of
+/// the destination.
+/// \returns A 128-bit vector of [2 x double] containing the horizontal
+/// differences of both operands.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_hsub_pd(__m128d __a, __m128d __b)
+{
+ return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
+}
+
+/// Moves and duplicates one double-precision value to double-precision
+/// values stored in a 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_loaddup_pd(double const *dp);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param dp
+/// A pointer to a double-precision value to be moved and duplicated.
+/// \returns A 128-bit vector of [2 x double] containing the moved and
+/// duplicated values.
+#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
+
+/// Moves and duplicates the double-precision value in the lower bits of
+/// a 128-bit vector of [2 x double] to double-precision values stored in a
+/// 128-bit vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
+/// [127:64] and [63:0] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the moved and
+/// duplicated values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_movedup_pd(__m128d __a)
+{
+ return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
+}
+
+/// Establishes a linear address memory range to be monitored and puts
+/// the processor in the monitor event pending state. Data stored in the
+/// monitored address range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
+///
+/// \param __p
+/// The memory range to be monitored. The size of the range is determined by
+/// CPUID function 0000_0005h.
+/// \param __extensions
+/// Optional extensions for the monitoring state.
+/// \param __hints
+/// Optional hints for the monitoring state.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
+{
+ __builtin_ia32_monitor(__p, __extensions, __hints);
+}
+
+/// Used with the MONITOR instruction to wait while the processor is in
+/// the monitor event pending state. Data stored in the monitored address
+/// range causes the processor to exit the pending state.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
+///
+/// \param __extensions
+/// Optional extensions for the monitoring state, which may vary by
+/// processor.
+/// \param __hints
+/// Optional hints for the monitoring state, which may vary by processor.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_mwait(unsigned __extensions, unsigned __hints)
+{
+ __builtin_ia32_mwait(__extensions, __hints);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __PMMINTRIN_H */
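Illustrative usage (not part of the patch): a common horizontal-sum idiom built from _mm_hadd_ps, assuming -msse3.

#include <pmmintrin.h>

/* Sum the four lanes of v with two horizontal adds. */
static inline float hsum_ps(__m128 v)
{
    __m128 t = _mm_hadd_ps(v, v);   /* (v0+v1, v2+v3, v0+v1, v2+v3) */
    t = _mm_hadd_ps(t, t);          /* every lane now holds the full sum */
    return _mm_cvtss_f32(t);
}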
--- /dev/null
+/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __POPCNTINTRIN_H
+#define __POPCNTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
+
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
+#else
+#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
+#endif
+
+/// Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+/// An unsigned 32-bit integer operand.
+/// \returns A 32-bit integer containing the number of bits with value 1 in the
+/// source operand.
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_popcnt_u32(unsigned int __A)
+{
+ return __builtin_popcount(__A);
+}
+
+#ifdef __x86_64__
+/// Counts the number of bits in the source operand having a value of 1.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
+///
+/// \param __A
+/// An unsigned 64-bit integer operand.
+/// \returns A 64-bit integer containing the number of bits with value 1 in the
+/// source operand.
+static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_popcnt_u64(unsigned long long __A)
+{
+ return __builtin_popcountll(__A);
+}
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_CONSTEXPR
+
+#endif /* __POPCNTINTRIN_H */
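Illustrative usage (not part of the patch): counting set bits with the intrinsic above, assuming -mpopcnt.

#include <popcntintrin.h>
#include <stdio.h>

int main(void)
{
    unsigned int mask = 0xF0F0u;
    printf("%d bits set\n", _mm_popcnt_u32(mask));   /* prints "8 bits set" */
    return 0;
}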
--- /dev/null
+/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
+#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef __PRFCHWINTRIN_H
+#define __PRFCHWINTRIN_H
+
+/// Loads a memory sequence containing the specified memory address into
+/// all data cache levels. The cache-coherency state is set to exclusive.
+/// Data can be read from and written to the cache line without additional
+/// delay.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHT0 instruction.
+///
+/// \param __P
+/// A pointer specifying the memory address to be prefetched.
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetch(void *__P)
+{
+ __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
+}
+
+/// Loads a memory sequence containing the specified memory address into
+/// the L1 data cache and sets the cache-coherency to modified. This
+/// provides a hint to the processor that the cache line will be modified.
+/// It is intended for use when the cache line will be written to shortly
+/// after the prefetch is performed.
+///
+/// Note that the effect of this intrinsic is dependent on the processor
+/// implementation.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHW instruction.
+///
+/// \param __P
+/// A pointer specifying the memory address to be prefetched.
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+_m_prefetchw(volatile const void *__P)
+{
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wcast-qual"
+ __builtin_prefetch ((const void*)__P, 1, 3 /* _MM_HINT_T0 */);
+#pragma clang diagnostic pop
+}
+
+#endif /* __PRFCHWINTRIN_H */
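Illustrative usage (not part of the patch): a write-prefetch hint ahead of an update; the counter type is made up, and because this is only a hint, correctness does not depend on it.

#include <x86intrin.h>

struct counter { long value; };

void bump(struct counter *c)
{
    _m_prefetchw(c);   /* hint: this cache line is about to be written */
    c->value += 1;
}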
--- /dev/null
+/*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <ptwriteintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __PTWRITEINTRIN_H
+#define __PTWRITEINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("ptwrite")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_ptwrite32(unsigned int __value) {
+ __builtin_ia32_ptwrite32(__value);
+}
+
+#ifdef __x86_64__
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_ptwrite64(unsigned long long __value) {
+ __builtin_ia32_ptwrite64(__value);
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __PTWRITEINTRIN_H */
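_ptwrite32/_ptwrite64 inject a user-supplied value into the Intel Processor Trace stream. A hedged sketch of a trace-marker helper, assuming a CPU with PTWRITE, a build with -mptwrite, and an active PT collector (otherwise the value is simply discarded):

    #include <immintrin.h>

    /* Tag the PT trace with an application-defined event id. */
    static void pt_mark(unsigned int event_id)
    {
        _ptwrite32(event_id);
    }

    #ifdef __x86_64__
    static void pt_mark64(unsigned long long payload)
    {
        _ptwrite64(payload);
    }
    #endif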
--- /dev/null
+/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __RDSEEDINTRIN_H
+#define __RDSEEDINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed16_step(unsigned short *__p)
+{
+ return __builtin_ia32_rdseed16_step(__p);
+}
+
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed32_step(unsigned int *__p)
+{
+ return __builtin_ia32_rdseed32_step(__p);
+}
+
+#ifdef __x86_64__
+static __inline__ int __DEFAULT_FN_ATTRS
+_rdseed64_step(unsigned long long *__p)
+{
+ return __builtin_ia32_rdseed64_step(__p);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RDSEEDINTRIN_H */
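Each _rdseedNN_step() returns 1 and stores a value on success, or 0 when the entropy source is temporarily drained, so callers are expected to retry. A small sketch (build with -mrdseed):

    #include <immintrin.h>

    static int rdseed32_retry(unsigned int *out, int retries)
    {
        while (retries-- > 0) {
            if (_rdseed32_step(out))
                return 1;       /* *out now holds a fresh seed */
            _mm_pause();        /* brief back-off before trying again */
        }
        return 0;               /* caller should fall back to another source */
    }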
--- /dev/null
+/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __RTMINTRIN_H
+#define __RTMINTRIN_H
+
+#define _XBEGIN_STARTED (~0u)
+#define _XABORT_EXPLICIT (1 << 0)
+#define _XABORT_RETRY (1 << 1)
+#define _XABORT_CONFLICT (1 << 2)
+#define _XABORT_CAPACITY (1 << 3)
+#define _XABORT_DEBUG (1 << 4)
+#define _XABORT_NESTED (1 << 5)
+#define _XABORT_CODE(x) (((x) >> 24) & 0xFF)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+_xbegin(void)
+{
+ return __builtin_ia32_xbegin();
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xend(void)
+{
+ __builtin_ia32_xend();
+}
+
+#define _xabort(imm) __builtin_ia32_xabort((imm))
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __RTMINTRIN_H */
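The usual RTM pattern is: start a transaction with _xbegin(), commit with _xend(), and take a non-transactional fallback whenever _xbegin() returns anything other than _XBEGIN_STARTED. A simplified sketch, assuming a build with -mrtm and a run-time CPUID check for RTM; real lock elision additionally reads the fallback lock inside the transaction, which is omitted here:

    #include <immintrin.h>

    static void counter_add(long *counter, long delta)
    {
        unsigned int status = _xbegin();
        if (status == _XBEGIN_STARTED) {
            *counter += delta;   /* speculative; becomes visible at _xend() */
            _xend();
        } else {
            /* Aborted: conflict, capacity, explicit _xabort(), etc.
               _XABORT_CODE(status) recovers the _xabort immediate if set. */
            __atomic_fetch_add(counter, delta, __ATOMIC_SEQ_CST);
        }
    }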
--- /dev/null
+/*===--------------- serializeintrin.h - serialize intrinsics --------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <serializeintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __SERIALIZEINTRIN_H
+#define __SERIALIZEINTRIN_H
+
+/// Serialize instruction fetch and execution.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> SERIALIZE </c> instruction.
+///
+static __inline__ void
+__attribute__((__always_inline__, __nodebug__, __target__("serialize")))
+_serialize (void)
+{
+ __builtin_ia32_serialize ();
+}
+
+#endif /* __SERIALIZEINTRIN_H */
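SERIALIZE is a lighter-weight alternative to CPUID for draining the pipeline, for example before reading the TSC or after cross-modifying code. A sketch (build with -mserialize):

    #include <x86intrin.h>

    /* Ensure all prior instructions retire before the timestamp is read. */
    static unsigned long long tsc_serialized(void)
    {
        _serialize();
        return __rdtsc();
    }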
--- /dev/null
+/*===---- sgxintrin.h - X86 SGX intrinsics configuration -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <sgxintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __SGXINTRIN_H
+#define __SGXINTRIN_H
+
+#if __has_extension(gnu_asm)
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("sgx")))
+
+static __inline unsigned int __DEFAULT_FN_ATTRS
+_enclu_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
+{
+ unsigned int __result;
+ __asm__ ("enclu"
+ : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
+ : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
+ : "cc");
+ return __result;
+}
+
+static __inline unsigned int __DEFAULT_FN_ATTRS
+_encls_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
+{
+ unsigned int __result;
+ __asm__ ("encls"
+ : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
+ : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
+ : "cc");
+ return __result;
+}
+
+static __inline unsigned int __DEFAULT_FN_ATTRS
+_enclv_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
+{
+ unsigned int __result;
+ __asm__ ("enclv"
+ : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
+ : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
+ : "cc");
+ return __result;
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __has_extension(gnu_asm) */
+
+#endif
--- /dev/null
+/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __SHAINTRIN_H
+#define __SHAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha"), __min_vector_width__(128)))
+
+#define _mm_sha1rnds4_epu32(V1, V2, M) \
+ __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
+{
+ return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
+{
+ return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __SHAINTRIN_H */
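As a hedged illustration of how the SHA-1 schedule helpers are meant to chain (following Intel's SHA extensions usage model): _mm_sha1msg1_epu32, an XOR with the next block of message words, then _mm_sha1msg2_epu32 produce the next four schedule words. This is only a fragment; a complete SHA-1 also needs the round helpers (_mm_sha1rnds4_epu32, _mm_sha1nexte_epu32) and byte-order handling. Build with -msha:

    #include <immintrin.h>

    /* W0..W3 hold message words W[i..i+15] as four [4 x i32] vectors;
       the return value is W[i+16..i+19]. */
    static __m128i sha1_schedule_next(__m128i W0, __m128i W1,
                                      __m128i W2, __m128i W3)
    {
        __m128i t = _mm_sha1msg1_epu32(W0, W1);
        t = _mm_xor_si128(t, W2);
        return _mm_sha1msg2_epu32(t, W3);
    }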
--- /dev/null
+/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __SMMINTRIN_H
+#define __SMMINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+#include <tmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))
+
+/* SSE4 Rounding macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NO_EXC 0x08
+
+#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
+#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
+#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
+#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
+#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
+#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
+
+/// Rounds up each element of the 128-bit vector of [4 x float] to an
+/// integer and returns the rounded values in a 128-bit vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_ceil_ps(__m128 X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float] values to be rounded up.
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
+#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
+
+/// Rounds up each element of the 128-bit vector of [2 x double] to an
+/// integer and returns the rounded values in a 128-bit vector of
+/// [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_ceil_pd(__m128d X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double] values to be rounded up.
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
+#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
+
+/// Copies three upper elements of the first 128-bit vector operand to
+/// the corresponding three upper elements of the 128-bit result vector of
+/// [4 x float]. Rounds up the lowest element of the second 128-bit vector
+/// operand to an integer and copies it to the lowest element of the 128-bit
+/// result vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
+/// copied to the corresponding bits of the result.
+/// \param Y
+/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
+/// rounded up to the nearest integer and copied to the corresponding bits
+/// of the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+/// values.
+#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
+
+/// Copies the upper element of the first 128-bit vector operand to the
+/// corresponding upper element of the 128-bit result vector of [2 x double].
+/// Rounds up the lower element of the second 128-bit vector operand to an
+/// integer and copies it to the lower element of the 128-bit result vector
+/// of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
+/// copied to the corresponding bits of the result.
+/// \param Y
+/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
+/// rounded up to the nearest integer and copied to the corresponding bits
+/// of the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+/// values.
+#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
+
+/// Rounds down each element of the 128-bit vector of [4 x float] to an
+///    integer and returns the rounded values in a 128-bit vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_floor_ps(__m128 X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float] values to be rounded down.
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
+#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
+
+/// Rounds down each element of the 128-bit vector of [2 x double] to an
+/// integer and returns the rounded values in a 128-bit vector of
+/// [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_floor_pd(__m128d X);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double].
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
+#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
+
+/// Copies three upper elements of the first 128-bit vector operand to
+/// the corresponding three upper elements of the 128-bit result vector of
+/// [4 x float]. Rounds down the lowest element of the second 128-bit vector
+/// operand to an integer and copies it to the lowest element of the 128-bit
+/// result vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
+/// copied to the corresponding bits of the result.
+/// \param Y
+/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
+/// rounded down to the nearest integer and copied to the corresponding bits
+/// of the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+/// values.
+#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
+
+/// Copies the upper element of the first 128-bit vector operand to the
+/// corresponding upper element of the 128-bit result vector of [2 x double].
+/// Rounds down the lower element of the second 128-bit vector operand to an
+/// integer and copies it to the lower element of the 128-bit result vector
+/// of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
+/// copied to the corresponding bits of the result.
+/// \param Y
+/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
+/// rounded down to the nearest integer and copied to the corresponding bits
+/// of the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+/// values.
+#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
+
+/// Rounds each element of the 128-bit vector of [4 x float] to an
+/// integer value according to the rounding control specified by the second
+/// argument and returns the rounded values in a 128-bit vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_round_ps(__m128 X, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float].
+/// \param M
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used \n
+/// 1: The PE field is not updated \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M \n
+/// 1: Use the current MXCSR setting \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest \n
+/// 01: Downward (toward negative infinity) \n
+/// 10: Upward (toward positive infinity) \n
+/// 11: Truncated
+/// \returns A 128-bit vector of [4 x float] containing the rounded values.
+#define _mm_round_ps(X, M) \
+ ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
+
+/// Copies three upper elements of the first 128-bit vector operand to
+/// the corresponding three upper elements of the 128-bit result vector of
+/// [4 x float]. Rounds the lowest element of the second 128-bit vector
+/// operand to an integer value according to the rounding control specified
+/// by the third argument and copies it to the lowest element of the 128-bit
+/// result vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
+/// copied to the corresponding bits of the result.
+/// \param Y
+/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
+/// rounded to the nearest integer using the specified rounding control and
+/// copied to the corresponding bits of the result.
+/// \param M
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used \n
+/// 1: The PE field is not updated \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M \n
+/// 1: Use the current MXCSR setting \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest \n
+/// 01: Downward (toward negative infinity) \n
+/// 10: Upward (toward positive infinity) \n
+/// 11: Truncated
+/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
+/// values.
+#define _mm_round_ss(X, Y, M) \
+ ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (M)))
+
+/// Rounds each element of the 128-bit vector of [2 x double] to an
+/// integer value according to the rounding control specified by the second
+/// argument and returns the rounded values in a 128-bit vector of
+/// [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_round_pd(__m128d X, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double].
+/// \param M
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used \n
+/// 1: The PE field is not updated \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M \n
+/// 1: Use the current MXCSR setting \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest \n
+/// 01: Downward (toward negative infinity) \n
+/// 10: Upward (toward positive infinity) \n
+/// 11: Truncated
+/// \returns A 128-bit vector of [2 x double] containing the rounded values.
+#define _mm_round_pd(X, M) \
+ ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
+
+/// Copies the upper element of the first 128-bit vector operand to the
+/// corresponding upper element of the 128-bit result vector of [2 x double].
+/// Rounds the lower element of the second 128-bit vector operand to an
+/// integer value according to the rounding control specified by the third
+/// argument and copies it to the lower element of the 128-bit result vector
+/// of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
+/// copied to the corresponding bits of the result.
+/// \param Y
+/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
+/// rounded to the nearest integer using the specified rounding control and
+/// copied to the corresponding bits of the result.
+/// \param M
+/// An integer value that specifies the rounding operation. \n
+/// Bits [7:4] are reserved. \n
+/// Bit [3] is a precision exception value: \n
+/// 0: A normal PE exception is used \n
+/// 1: The PE field is not updated \n
+/// Bit [2] is the rounding control source: \n
+/// 0: Use bits [1:0] of \a M \n
+/// 1: Use the current MXCSR setting \n
+/// Bits [1:0] contain the rounding control definition: \n
+/// 00: Nearest \n
+/// 01: Downward (toward negative infinity) \n
+/// 10: Upward (toward positive infinity) \n
+/// 11: Truncated
+/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
+/// values.
+#define _mm_round_sd(X, Y, M) \
+ ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (M)))
+
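The ceil/floor macros above are just _mm_round_* with fixed modes; the general macros also take an explicit rounding mode, optionally combined with _MM_FROUND_NO_EXC. Two small examples (SSE4.1, built with -msse4.1):

    #include <smmintrin.h>

    /* Round to nearest without raising the precision exception. */
    static __m128 round_nearest_quiet(__m128 x)
    {
        return _mm_round_ps(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }

    /* Fractional part of each element: x - floor(x). */
    static __m128 frac_ps(__m128 x)
    {
        return _mm_sub_ps(x, _mm_floor_ps(x));
    }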
+/* SSE4 Packed Blending Intrinsics. */
+/// Returns a 128-bit vector of [2 x double] where the values are
+/// selected from either the first or second operand as specified by the
+/// third operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
+///
+/// \param V1
+/// A 128-bit vector of [2 x double].
+/// \param V2
+/// A 128-bit vector of [2 x double].
+/// \param M
+/// An immediate integer operand, with mask bits [1:0] specifying how the
+/// values are to be copied. The position of the mask bit corresponds to the
+/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
+/// element in operand \a V1 is copied to the same position in the result.
+/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
+/// is copied to the same position in the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
+#define _mm_blend_pd(V1, V2, M) \
+ ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
+ (__v2df)(__m128d)(V2), (int)(M)))
+
+/// Returns a 128-bit vector of [4 x float] where the values are selected
+/// from either the first or second operand as specified by the third
+/// operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
+///
+/// \param V1
+/// A 128-bit vector of [4 x float].
+/// \param V2
+/// A 128-bit vector of [4 x float].
+/// \param M
+/// An immediate integer operand, with mask bits [3:0] specifying how the
+/// values are to be copied. The position of the mask bit corresponds to the
+/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
+/// element in operand \a V1 is copied to the same position in the result.
+/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
+/// is copied to the same position in the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
+#define _mm_blend_ps(V1, V2, M) \
+ ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
+ (__v4sf)(__m128)(V2), (int)(M)))
+
+/// Returns a 128-bit vector of [2 x double] where the values are
+/// selected from either the first or second operand as specified by the
+/// third operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [2 x double].
+/// \param __V2
+/// A 128-bit vector of [2 x double].
+/// \param __M
+/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
+/// values are to be copied. The position of the mask bit corresponds to the
+/// most significant bit of a copied value. When a mask bit is 0, the
+/// corresponding 64-bit element in operand \a __V1 is copied to the same
+/// position in the result. When a mask bit is 1, the corresponding 64-bit
+/// element in operand \a __V2 is copied to the same position in the result.
+/// \returns A 128-bit vector of [2 x double] containing the copied values.
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
+{
+ return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
+ (__v2df)__M);
+}
+
+/// Returns a 128-bit vector of [4 x float] where the values are
+/// selected from either the first or second operand as specified by the
+/// third operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x float].
+/// \param __V2
+/// A 128-bit vector of [4 x float].
+/// \param __M
+/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
+/// how the values are to be copied. The position of the mask bit corresponds
+/// to the most significant bit of a copied value. When a mask bit is 0, the
+/// corresponding 32-bit element in operand \a __V1 is copied to the same
+/// position in the result. When a mask bit is 1, the corresponding 32-bit
+/// element in operand \a __V2 is copied to the same position in the result.
+/// \returns A 128-bit vector of [4 x float] containing the copied values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
+{
+ return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
+ (__v4sf)__M);
+}
+
+/// Returns a 128-bit vector of [16 x i8] where the values are selected
+///    from either the first or second operand as specified by the third
+/// operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [16 x i8].
+/// \param __V2
+/// A 128-bit vector of [16 x i8].
+/// \param __M
+/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
+/// how the values are to be copied. The position of the mask bit corresponds
+/// to the most significant bit of a copied value. When a mask bit is 0, the
+/// corresponding 8-bit element in operand \a __V1 is copied to the same
+/// position in the result. When a mask bit is 1, the corresponding 8-bit
+/// element in operand \a __V2 is copied to the same position in the result.
+/// \returns A 128-bit vector of [16 x i8] containing the copied values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
+{
+ return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
+ (__v16qi)__M);
+}
+
+/// Returns a 128-bit vector of [8 x i16] where the values are selected
+///    from either the first or second operand as specified by the third
+/// operand, the control mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
+///
+/// \param V1
+/// A 128-bit vector of [8 x i16].
+/// \param V2
+/// A 128-bit vector of [8 x i16].
+/// \param M
+/// An immediate integer operand, with mask bits [7:0] specifying how the
+/// values are to be copied. The position of the mask bit corresponds to the
+/// index of a copied value. When a mask bit is 0, the corresponding 16-bit
+/// element in operand \a V1 is copied to the same position in the result.
+/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2
+/// is copied to the same position in the result.
+/// \returns A 128-bit vector of [8 x i16] containing the copied values.
+#define _mm_blend_epi16(V1, V2, M) \
+ ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
+ (__v8hi)(__m128i)(V2), (int)(M)))
+
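Immediate blends pick lanes with a compile-time mask, while the blendv forms pick per lane from the sign bit of a runtime mask, typically a comparison result. A short sketch (SSE4.1):

    #include <smmintrin.h>

    /* Elements 0 and 2 from a, elements 1 and 3 from b (mask bits 1010). */
    static __m128 interleave_ab(__m128 a, __m128 b)
    {
        return _mm_blend_ps(a, b, 0xA);
    }

    /* Element-wise "b where a < b, else a", i.e. a max-like select. */
    static __m128 select_greater(__m128 a, __m128 b)
    {
        return _mm_blendv_ps(a, b, _mm_cmplt_ps(a, b));
    }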
+/* SSE4 Dword Multiply Instructions. */
+/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
+///    and returns the lower 32 bits of each product in a 128-bit vector of
+/// [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
+///
+/// \param __V1
+/// A 128-bit integer vector.
+/// \param __V2
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the products of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
+{
+ return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
+}
+
+/// Multiplies corresponding even-indexed elements of two 128-bit
+/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
+/// containing the products.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x i32].
+/// \param __V2
+/// A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [2 x i64] containing the products of both
+/// operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mul_epi32 (__m128i __V1, __m128i __V2)
+{
+ return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
+}
+
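The two multiplies differ in what they keep: _mm_mullo_epi32 keeps the low 32 bits of all four products, while _mm_mul_epi32 keeps the full signed 64-bit products of the even-indexed elements only. For example:

    #include <smmintrin.h>

    static void mul_both(__m128i a, __m128i b, __m128i *lo32x4, __m128i *full64x2)
    {
        *lo32x4   = _mm_mullo_epi32(a, b);  /* [4 x i32], truncated products   */
        *full64x2 = _mm_mul_epi32(a, b);    /* [2 x i64] from elements 0 and 2 */
    }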
+/* SSE4 Floating Point Dot Product Instructions. */
+/// Computes the dot product of the two 128-bit vectors of [4 x float]
+/// and returns it in the elements of the 128-bit result vector of
+/// [4 x float].
+///
+/// The immediate integer operand controls which input elements
+/// will contribute to the dot product, and where the final results are
+/// returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float].
+/// \param Y
+/// A 128-bit vector of [4 x float].
+/// \param M
+/// An immediate integer operand. Mask bits [7:4] determine which elements
+/// of the input vectors are used, with bit [4] corresponding to the lowest
+/// element and bit [7] corresponding to the highest element of each [4 x
+/// float] vector. If a bit is set, the corresponding elements from the two
+/// input vectors are used as an input for dot product; otherwise that input
+/// is treated as zero. Bits [3:0] determine which elements of the result
+/// will receive a copy of the final dot product, with bit [0] corresponding
+/// to the lowest element and bit [3] corresponding to the highest element of
+/// each [4 x float] subvector. If a bit is set, the dot product is returned
+/// in the corresponding element; otherwise that element is set to zero.
+/// \returns A 128-bit vector of [4 x float] containing the dot product.
+#define _mm_dp_ps(X, Y, M) \
+ ((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (M)))
+
+/// Computes the dot product of the two 128-bit vectors of [2 x double]
+/// and returns it in the elements of the 128-bit result vector of
+/// [2 x double].
+///
+/// The immediate integer operand controls which input
+/// elements will contribute to the dot product, and where the final results
+/// are returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [2 x double].
+/// \param Y
+/// A 128-bit vector of [2 x double].
+/// \param M
+/// An immediate integer operand. Mask bits [5:4] determine which elements
+/// of the input vectors are used, with bit [4] corresponding to the lowest
+///    element and bit [5] corresponding to the highest element of each [2 x
+///    double] vector. If a bit is set, the corresponding elements from the two
+/// input vectors are used as an input for dot product; otherwise that input
+/// is treated as zero. Bits [1:0] determine which elements of the result
+/// will receive a copy of the final dot product, with bit [0] corresponding
+/// to the lowest element and bit [1] corresponding to the highest element of
+/// each [2 x double] vector. If a bit is set, the dot product is returned in
+///    the corresponding element; otherwise that element is set to zero.
+/// \returns A 128-bit vector of [2 x double] containing the dot product.
+#define _mm_dp_pd(X, Y, M) \
+ ((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (M)))
+
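A common use of _mm_dp_ps is a full 4-element dot product with the scalar result placed in element 0, i.e. mask 0xF1 (bits [7:4] select all four inputs, bit [0] selects the destination lane):

    #include <smmintrin.h>

    static float dot4(__m128 x, __m128 y)
    {
        return _mm_cvtss_f32(_mm_dp_ps(x, y, 0xF1));
    }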
+/* SSE4 Streaming Load Hint Instruction. */
+/// Loads integer values from a 128-bit aligned memory location to a
+/// 128-bit integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
+///
+/// \param __V
+/// A pointer to a 128-bit aligned memory location that contains the integer
+/// values.
+/// \returns A 128-bit integer vector containing the data stored at the
+/// specified memory location.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_stream_load_si128 (__m128i const *__V)
+{
+ return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
+}
+
+/* SSE4 Packed Integer Min/Max Instructions. */
+/// Compares the corresponding elements of two 128-bit vectors of
+/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
+/// of the two values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [16 x i8].
+/// \param __V2
+///    A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi8 (__m128i __V1, __m128i __V2)
+{
+#if (__clang_major__ < 14)
+ return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
+#else
+ return (__m128i) __builtin_elementwise_min((__v16qs) __V1, (__v16qs) __V2);
+#endif
+}
+
+/// Compares the corresponding elements of two 128-bit vectors of
+/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
+/// greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [16 x i8].
+/// \param __V2
+/// A 128-bit vector of [16 x i8].
+/// \returns A 128-bit vector of [16 x i8] containing the greater values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi8 (__m128i __V1, __m128i __V2)
+{
+#if (__clang_major__ < 14)
+ return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
+#else
+ return (__m128i) __builtin_elementwise_max((__v16qs) __V1, (__v16qs) __V2);
+#endif
+}
+
+/// Compares the corresponding elements of two 128-bit vectors of
+/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
+/// value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [8 x u16].
+/// \param __V2
+/// A 128-bit vector of [8 x u16].
+/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu16 (__m128i __V1, __m128i __V2)
+{
+#if (__clang_major__ < 14)
+ return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
+#else
+ return (__m128i) __builtin_elementwise_min((__v8hu) __V1, (__v8hu) __V2);
+#endif
+}
+
+/// Compares the corresponding elements of two 128-bit vectors of
+/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
+/// greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [8 x u16].
+/// \param __V2
+/// A 128-bit vector of [8 x u16].
+/// \returns A 128-bit vector of [8 x u16] containing the greater values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu16 (__m128i __V1, __m128i __V2)
+{
+#if (__clang_major__ < 14)
+ return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
+#else
+ return (__m128i) __builtin_elementwise_max((__v8hu) __V1, (__v8hu) __V2);
+#endif
+}
+
+/// Compares the corresponding elements of two 128-bit vectors of
+/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
+/// value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x i32].
+/// \param __V2
+/// A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epi32 (__m128i __V1, __m128i __V2)
+{
+#if (__clang_major__ < 14)
+ return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
+#else
+ return (__m128i) __builtin_elementwise_min((__v4si) __V1, (__v4si) __V2);
+#endif
+}
+
+/// Compares the corresponding elements of two 128-bit vectors of
+/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
+/// greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x i32].
+/// \param __V2
+/// A 128-bit vector of [4 x i32].
+/// \returns A 128-bit vector of [4 x i32] containing the greater values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epi32 (__m128i __V1, __m128i __V2)
+{
+#if (__clang_major__ < 14)
+ return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
+#else
+ return (__m128i) __builtin_elementwise_max((__v4si) __V1, (__v4si) __V2);
+#endif
+}
+
+/// Compares the corresponding elements of two 128-bit vectors of
+/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
+/// value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x u32].
+/// \param __V2
+/// A 128-bit vector of [4 x u32].
+/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_min_epu32 (__m128i __V1, __m128i __V2)
+{
+#if (__clang_major__ < 14)
+ return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
+#else
+ return (__m128i) __builtin_elementwise_min((__v4su) __V1, (__v4su) __V2);
+#endif
+}
+
+/// Compares the corresponding elements of two 128-bit vectors of
+/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
+/// greater value of the two.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x u32].
+/// \param __V2
+/// A 128-bit vector of [4 x u32].
+/// \returns A 128-bit vector of [4 x u32] containing the greater values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_max_epu32 (__m128i __V1, __m128i __V2)
+{
+#if (__clang_major__ < 14)
+ return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
+#else
+ return (__m128i) __builtin_elementwise_max((__v4su) __V1, (__v4su) __V2);
+#endif
+}
+
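These packed min/max forms compose naturally into clamps, for example saturating each signed 32-bit lane into a [lo, hi] range:

    #include <smmintrin.h>

    static __m128i clamp_epi32(__m128i v, __m128i lo, __m128i hi)
    {
        return _mm_min_epi32(_mm_max_epi32(v, lo), hi);
    }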
+/* SSE4 Insertion and Extraction from XMM Register Instructions. */
+/// Takes the first argument \a X and inserts an element from the second
+/// argument \a Y as selected by the third argument \a N. That result then
+/// has elements zeroed out also as selected by the third argument \a N. The
+/// resulting 128-bit vector of [4 x float] is then returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
+///
+/// \param X
+/// A 128-bit vector source operand of [4 x float]. With the exception of
+/// those bits in the result copied from parameter \a Y and zeroed by bits
+/// [3:0] of \a N, all bits from this parameter are copied to the result.
+/// \param Y
+/// A 128-bit vector source operand of [4 x float]. One single-precision
+/// floating-point element from this source, as determined by the immediate
+/// parameter, is copied to the result.
+/// \param N
+/// Specifies which bits from operand \a Y will be copied, which bits in the
+///    result they will be copied to, and which bits in the result will be
+/// cleared. The following assignments are made: \n
+/// Bits [7:6] specify the bits to copy from operand \a Y: \n
+/// 00: Selects bits [31:0] from operand \a Y. \n
+/// 01: Selects bits [63:32] from operand \a Y. \n
+/// 10: Selects bits [95:64] from operand \a Y. \n
+/// 11: Selects bits [127:96] from operand \a Y. \n
+/// Bits [5:4] specify the bits in the result to which the selected bits
+/// from operand \a Y are copied: \n
+/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n
+/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n
+/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n
+/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
+///    Bits [3:0]: If any of these bits are set, the corresponding result
+/// element is cleared.
+/// \returns A 128-bit vector of [4 x float] containing the copied
+/// single-precision floating point elements from the operands.
+#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
+
+/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
+/// returns it, using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_ps(__m128 X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
+/// instruction.
+///
+/// \param X
+/// A 128-bit vector of [4 x float].
+/// \param N
+/// An immediate value. Bits [1:0] determines which bits from the argument
+/// \a X are extracted and returned: \n
+/// 00: Bits [31:0] of parameter \a X are returned. \n
+/// 01: Bits [63:32] of parameter \a X are returned. \n
+/// 10: Bits [95:64] of parameter \a X are returned. \n
+/// 11: Bits [127:96] of parameter \a X are returned.
+/// \returns A 32-bit integer containing the extracted 32 bits of float data.
+#define _mm_extract_ps(X, N) \
+ __builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
+
+/* Miscellaneous insert and extract macros. */
+/* Extract a single-precision float from X at index N into D. */
+#define _MM_EXTRACT_FLOAT(D, X, N) \
+ do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0)
+
+/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
+ an index suitable for _mm_insert_ps. */
+#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
+
+/* Extract a float from X at index N into the first index of the return. */
+#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
+ _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
+
+/* Insert int into packed integer array at index. */
+/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
+/// the 128-bit integer vector parameter, and then inserting the lower 8 bits
+/// of an integer parameter \a I into an offset specified by the immediate
+/// value parameter \a N.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
+///
+/// \param X
+/// A 128-bit integer vector of [16 x i8]. This vector is copied to the
+/// result and then one of the sixteen elements in the result vector is
+/// replaced by the lower 8 bits of \a I.
+/// \param I
+/// An integer. The lower 8 bits of this operand are written to the result
+/// beginning at the offset specified by \a N.
+/// \param N
+/// An immediate value. Bits [3:0] specify the bit offset in the result at
+/// which the lower 8 bits of \a I are written. \n
+/// 0000: Bits [7:0] of the result are used for insertion. \n
+/// 0001: Bits [15:8] of the result are used for insertion. \n
+/// 0010: Bits [23:16] of the result are used for insertion. \n
+/// 0011: Bits [31:24] of the result are used for insertion. \n
+/// 0100: Bits [39:32] of the result are used for insertion. \n
+/// 0101: Bits [47:40] of the result are used for insertion. \n
+/// 0110: Bits [55:48] of the result are used for insertion. \n
+/// 0111: Bits [63:56] of the result are used for insertion. \n
+/// 1000: Bits [71:64] of the result are used for insertion. \n
+/// 1001: Bits [79:72] of the result are used for insertion. \n
+/// 1010: Bits [87:80] of the result are used for insertion. \n
+/// 1011: Bits [95:88] of the result are used for insertion. \n
+/// 1100: Bits [103:96] of the result are used for insertion. \n
+/// 1101: Bits [111:104] of the result are used for insertion. \n
+/// 1110: Bits [119:112] of the result are used for insertion. \n
+/// 1111: Bits [127:120] of the result are used for insertion.
+/// \returns A 128-bit integer vector containing the constructed values.
+#define _mm_insert_epi8(X, I, N) \
+ ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
+ (int)(I), (int)(N)))
+
+/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
+/// the 128-bit integer vector parameter, and then inserting the 32-bit
+/// integer parameter \a I at the offset specified by the immediate value
+/// parameter \a N.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
+///
+/// \param X
+/// A 128-bit integer vector of [4 x i32]. This vector is copied to the
+/// result and then one of the four elements in the result vector is
+/// replaced by \a I.
+/// \param I
+/// A 32-bit integer that is written to the result beginning at the offset
+/// specified by \a N.
+/// \param N
+/// An immediate value. Bits [1:0] specify the bit offset in the result at
+/// which the integer \a I is written. \n
+/// 00: Bits [31:0] of the result are used for insertion. \n
+/// 01: Bits [63:32] of the result are used for insertion. \n
+/// 10: Bits [95:64] of the result are used for insertion. \n
+/// 11: Bits [127:96] of the result are used for insertion.
+/// \returns A 128-bit integer vector containing the constructed values.
+#define _mm_insert_epi32(X, I, N) \
+ ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
+ (int)(I), (int)(N)))
+
+#ifdef __x86_64__
+/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
+/// the 128-bit integer vector parameter, and then inserting the 64-bit
+/// integer parameter \a I, using the immediate value parameter \a N as an
+/// insertion location selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
+///
+/// \param X
+/// A 128-bit integer vector of [2 x i64]. This vector is copied to the
+/// result and then one of the two elements in the result vector is replaced
+/// by \a I.
+/// \param I
+/// A 64-bit integer that is written to the result beginning at the offset
+/// specified by \a N.
+/// \param N
+/// An immediate value. Bit [0] specifies the bit offset in the result at
+/// which the integer \a I is written. \n
+/// 0: Bits [63:0] of the result are used for insertion. \n
+/// 1: Bits [127:64] of the result are used for insertion. \n
+/// \returns A 128-bit integer vector containing the constructed values.
+#define _mm_insert_epi64(X, I, N) \
+ ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
+ (long long)(I), (int)(N)))
+#endif /* __x86_64__ */
+
+/* Extract int from packed integer array at index. This returns the element
+ * as a zero extended value, so it is unsigned.
+ */
+/// Extracts an 8-bit element from the 128-bit integer vector of
+/// [16 x i8], using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_epi8(__m128i X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
+///
+/// \param X
+/// A 128-bit integer vector.
+/// \param N
+/// An immediate value. Bits [3:0] specify which 8-bit vector element from
+/// the argument \a X to extract and copy to the result. \n
+/// 0000: Bits [7:0] of parameter \a X are extracted. \n
+/// 0001: Bits [15:8] of the parameter \a X are extracted. \n
+/// 0010: Bits [23:16] of the parameter \a X are extracted. \n
+/// 0011: Bits [31:24] of the parameter \a X are extracted. \n
+/// 0100: Bits [39:32] of the parameter \a X are extracted. \n
+/// 0101: Bits [47:40] of the parameter \a X are extracted. \n
+/// 0110: Bits [55:48] of the parameter \a X are extracted. \n
+/// 0111: Bits [63:56] of the parameter \a X are extracted. \n
+/// 1000: Bits [71:64] of the parameter \a X are extracted. \n
+/// 1001: Bits [79:72] of the parameter \a X are extracted. \n
+/// 1010: Bits [87:80] of the parameter \a X are extracted. \n
+/// 1011: Bits [95:88] of the parameter \a X are extracted. \n
+/// 1100: Bits [103:96] of the parameter \a X are extracted. \n
+/// 1101: Bits [111:104] of the parameter \a X are extracted. \n
+/// 1110: Bits [119:112] of the parameter \a X are extracted. \n
+/// 1111: Bits [127:120] of the parameter \a X are extracted.
+/// \returns An unsigned integer, whose lower 8 bits are selected from the
+/// 128-bit integer vector parameter and the remaining bits are assigned
+/// zeros.
+#define _mm_extract_epi8(X, N) \
+ ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
+ (int)(N)))
+
+/// Extracts a 32-bit element from the 128-bit integer vector of
+/// [4 x i32], using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_epi32(__m128i X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
+///
+/// \param X
+/// A 128-bit integer vector.
+/// \param N
+/// An immediate value. Bits [1:0] specify which 32-bit vector element from
+/// the argument \a X to extract and copy to the result. \n
+/// 00: Bits [31:0] of the parameter \a X are extracted. \n
+/// 01: Bits [63:32] of the parameter \a X are extracted. \n
+/// 10: Bits [95:64] of the parameter \a X are extracted. \n
+///    11: Bits [127:96] of the parameter \a X are extracted.
+/// \returns An integer, whose lower 32 bits are selected from the 128-bit
+/// integer vector parameter and the remaining bits are assigned zeros.
+#define _mm_extract_epi32(X, N) \
+ ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
+
+#ifdef __x86_64__
+/// Extracts a 64-bit element from the 128-bit integer vector of
+/// [2 x i64], using the immediate value parameter \a N as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// long long _mm_extract_epi64(__m128i X, const int N);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
+///
+/// \param X
+/// A 128-bit integer vector.
+/// \param N
+/// An immediate value. Bit [0] specifies which 64-bit vector element from
+/// the argument \a X to return. \n
+/// 0: Bits [63:0] are returned. \n
+/// 1: Bits [127:64] are returned. \n
+/// \returns A 64-bit integer.
+#define _mm_extract_epi64(X, N) \
+ ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
+#endif /* __x86_64__ */
+
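The insert/extract forms take the lane index as an integer constant expression. A small example replacing lane 2 of a [4 x i32] vector and reading back lane 3:

    #include <smmintrin.h>

    static int insert_then_extract(__m128i v)
    {
        v = _mm_insert_epi32(v, 42, 2);   /* write 42 into bits [95:64] */
        return _mm_extract_epi32(v, 3);   /* read back bits [127:96]    */
    }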
+/* SSE4 128-bit Packed Integer Comparisons. */
+/// Tests whether the specified bits in a 128-bit integer vector are all
+/// zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param __M
+/// A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+/// A 128-bit integer vector selecting which bits to test in operand \a __M.
+/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testz_si128(__m128i __M, __m128i __V)
+{
+ return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
+}
+
+/// Tests whether the specified bits in a 128-bit integer vector are all
+/// ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param __M
+/// A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+/// A 128-bit integer vector selecting which bits to test in operand \a __M.
+/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testc_si128(__m128i __M, __m128i __V)
+{
+ return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
+}
+
+/// Tests whether the specified bits in a 128-bit integer vector are
+/// neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param __M
+/// A 128-bit integer vector containing the bits to be tested.
+/// \param __V
+/// A 128-bit integer vector selecting which bits to test in operand \a __M.
+/// \returns TRUE if the specified bits are neither all zeros nor all ones;
+/// FALSE otherwise.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_testnzc_si128(__m128i __M, __m128i __V)
+{
+ return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
+}
+
+/// Tests whether the specified bits in a 128-bit integer vector are all
+/// ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_test_all_ones(__m128i V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param V
+/// A 128-bit integer vector containing the bits to be tested.
+/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
+/// otherwise.
+#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
+
+/// Tests whether the specified bits in a 128-bit integer vector are
+/// neither all zeros nor all ones.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param M
+/// A 128-bit integer vector containing the bits to be tested.
+/// \param V
+/// A 128-bit integer vector selecting which bits to test in operand \a M.
+/// \returns TRUE if the specified bits are neither all zeros nor all ones;
+/// FALSE otherwise.
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
+
+/// Tests whether the specified bits in a 128-bit integer vector are all
+/// zeros.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_test_all_zeros(__m128i M, __m128i V);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
+///
+/// \param M
+/// A 128-bit integer vector containing the bits to be tested.
+/// \param V
+/// A 128-bit integer vector selecting which bits to test in operand \a M.
+/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
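The PTEST-based predicates make whole-vector checks cheap; for instance, two vectors are bit-for-bit equal exactly when their XOR tests as all zeros:

    #include <smmintrin.h>

    static int vectors_equal(__m128i a, __m128i b)
    {
        __m128i diff = _mm_xor_si128(a, b);
        return _mm_test_all_zeros(diff, diff);   /* 1 if a == b, else 0 */
    }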
+/* SSE4 64-bit Packed Integer Comparisons. */
+/// Compares each of the corresponding 64-bit values of the 128-bit
+/// integer vectors for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
+///
+/// \param __V1
+/// A 128-bit integer vector.
+/// \param __V2
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
+{
+ return (__m128i)((__v2di)__V1 == (__v2di)__V2);
+}
+
+/* SSE4 Packed Integer Sign-Extension. */
+/// Sign-extends each of the lower eight 8-bit integer elements of a
+/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
+/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
+/// are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
+///
+/// \param __V
+/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
+/// extended to 16-bit values.
+/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi16(__m128i __V)
+{
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
+}
+
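+/* Usage sketch (editorial; names are illustrative): widening eight signed bytes
+ * to 16-bit lanes before arithmetic that would overflow in 8 bits.
+ *
+ *   __m128i bytes = _mm_loadl_epi64((const __m128i *)src); // 8 bytes in the low half
+ *   __m128i words = _mm_cvtepi8_epi16(bytes);              // [8 x i16], sign-extended
+ */
+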
+/// Sign-extends each of the lower four 8-bit integer elements of a
+/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
+/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
+///
+/// \param __V
+/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
+/// sign-extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi32(__m128i __V)
+{
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
+}
+
+/// Sign-extends each of the lower two 8-bit integer elements of a
+/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
+/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
+///
+/// \param __V
+/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
+/// sign-extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi8_epi64(__m128i __V)
+{
+ /* This function always performs a signed extension, but __v16qi is a char
+ which may be signed or unsigned, so use __v16qs. */
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
+}
+
+/// Sign-extends each of the lower four 16-bit integer elements of a
+/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
+/// a 128-bit vector of [4 x i32]. The upper four elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
+///
+/// \param __V
+/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
+/// sign-extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi32(__m128i __V)
+{
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
+}
+
+/// Sign-extends each of the lower two 16-bit integer elements of a
+/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
+/// a 128-bit vector of [2 x i64]. The upper six elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
+///
+/// \param __V
+/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
+/// sign-extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi16_epi64(__m128i __V)
+{
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
+}
+
+/// Sign-extends each of the lower two 32-bit integer elements of a
+/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
+/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
+/// are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
+///
+/// \param __V
+/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
+/// sign-extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepi32_epi64(__m128i __V)
+{
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
+}
+
+/* SSE4 Packed Integer Zero-Extension. */
+/// Zero-extends each of the lower eight 8-bit integer elements of a
+/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
+/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
+/// are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
+///
+/// \param __V
+/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
+/// zero-extended to 16-bit values.
+/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi16(__m128i __V)
+{
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
+}
+
+/// Zero-extends each of the lower four 8-bit integer elements of a
+/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
+/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
+///
+/// \param __V
+/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
+/// zero-extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi32(__m128i __V)
+{
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
+}
+
+/// Zero-extends each of the lower two 8-bit integer elements of a
+/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
+/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
+///
+/// \param __V
+/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
+/// zero-extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu8_epi64(__m128i __V)
+{
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
+}
+
+/// Zero-extends each of the lower four 16-bit integer elements of a
+/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
+/// a 128-bit vector of [4 x i32]. The upper four elements of the input
+/// vector are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
+///
+/// \param __V
+/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
+/// zero-extended to 32-bit values.
+/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu16_epi32(__m128i __V)
+{
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
+}
+
+/// Zero-extends each of the lower two 16-bit integer elements of a
+/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
+/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
+/// are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
+///
+/// \param __V
+/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
+/// zero-extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu16_epi64(__m128i __V)
+{
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
+}
+
+/// Zero-extends each of the lower two 32-bit integer elements of a
+/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
+/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
+/// are unused.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
+///
+/// \param __V
+/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
+/// zero-extended to 64-bit values.
+/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cvtepu32_epi64(__m128i __V)
+{
+ return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
+}
+
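+/* Usage sketch (editorial; names are illustrative): the zero-extension forms
+ * mirror the sign-extension forms above and are the usual way to widen
+ * unsigned pixel data.
+ *
+ *   __m128i px8  = _mm_loadl_epi64((const __m128i *)pixels); // 8 x u8
+ *   __m128i px16 = _mm_cvtepu8_epi16(px8);                   // 8 x u16 lanes
+ */
+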
+/* SSE4 Pack with Unsigned Saturation. */
+/// Converts 32-bit signed integers from both 128-bit integer vector
+/// operands into 16-bit unsigned integers, and returns the packed result.
+/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
+/// 0x0000 are saturated to 0x0000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
+///
+/// \param __V1
+/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
+/// signed integer and is converted to a 16-bit unsigned integer with
+/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
+/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
+/// are written to the lower 64 bits of the result.
+/// \param __V2
+/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
+/// signed integer and is converted to a 16-bit unsigned integer with
+/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
+/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
+/// are written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_packus_epi32(__m128i __V1, __m128i __V2)
+{
+ return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
+}
+
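+/* Usage sketch (editorial; names are illustrative): narrowing two [4 x i32]
+ * vectors back to [8 x u16] with saturation, e.g. after widened intermediate
+ * arithmetic.
+ *
+ *   __m128i lo32 = _mm_set_epi32(70000, -5, 65535, 0);
+ *   __m128i hi32 = _mm_set_epi32(1, 2, 3, 4);
+ *   __m128i u16  = _mm_packus_epi32(lo32, hi32); // 70000 -> 0xFFFF, -5 -> 0x0000
+ */
+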
+/* SSE4 Multiple Packed Sums of Absolute Difference. */
+/// Subtracts 8-bit unsigned integer values and computes the absolute
+/// values of the differences. Sums of those absolute differences are then
+/// returned according to the bit fields in the immediate operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
+///
+/// \param X
+/// A 128-bit vector of [16 x i8].
+/// \param Y
+/// A 128-bit vector of [16 x i8].
+/// \param M
+/// An 8-bit immediate operand specifying how the absolute differences are to
+/// be calculated, according to the following algorithm:
+/// \code
+/// // M2 represents bit 2 of the immediate operand
+/// // M10 represents bits [1:0] of the immediate operand
+/// i = M2 * 4;
+/// j = M10 * 4;
+/// for (k = 0; k < 8; k = k + 1) {
+/// d0 = abs(X[i + k + 0] - Y[j + 0]);
+/// d1 = abs(X[i + k + 1] - Y[j + 1]);
+/// d2 = abs(X[i + k + 2] - Y[j + 2]);
+/// d3 = abs(X[i + k + 3] - Y[j + 3]);
+/// r[k] = d0 + d1 + d2 + d3;
+/// }
+/// \endcode
+/// \returns A 128-bit integer vector containing the sums of the sets of
+/// absolute differences between both operands.
+#define _mm_mpsadbw_epu8(X, Y, M) \
+ ((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (M)))
+
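+/* Usage sketch (editorial): with an immediate of 0, the eight 4-byte windows of
+ * \a X starting at byte offsets 0..7 are each compared against bytes 0..3 of
+ * \a Y, yielding eight 16-bit sums of absolute differences.
+ *
+ *   __m128i sads = _mm_mpsadbw_epu8(X, Y, 0);
+ */
+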
+/// Finds the minimum unsigned 16-bit element in the input 128-bit
+/// vector of [8 x u16] and returns it along with its index.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
+/// instruction.
+///
+/// \param __V
+/// A 128-bit vector of [8 x u16].
+/// \returns A 128-bit value where bits [15:0] contain the minimum value found
+/// in parameter \a __V, bits [18:16] contain the index of the minimum value
+/// and the remaining bits are set to 0.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_minpos_epu16(__m128i __V)
+{
+ return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
+}
+
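+/* Usage sketch (editorial; names are illustrative): extracting the minimum and
+ * its position from the result of _mm_minpos_epu16.
+ *
+ *   __m128i r    = _mm_minpos_epu16(v);
+ *   unsigned min = (unsigned)_mm_extract_epi16(r, 0); // bits [15:0]
+ *   unsigned idx = (unsigned)_mm_extract_epi16(r, 1); // bits [18:16]
+ */
+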
+/* Handle the sse4.2 definitions here. */
+
+/* These definitions are normally in nmmintrin.h, but gcc puts them in here
+ so we'll do the same. */
+
+#undef __DEFAULT_FN_ATTRS
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
+
+/* These specify the type of data that we're comparing. */
+#define _SIDD_UBYTE_OPS 0x00
+#define _SIDD_UWORD_OPS 0x01
+#define _SIDD_SBYTE_OPS 0x02
+#define _SIDD_SWORD_OPS 0x03
+
+/* These specify the type of comparison operation. */
+#define _SIDD_CMP_EQUAL_ANY 0x00
+#define _SIDD_CMP_RANGES 0x04
+#define _SIDD_CMP_EQUAL_EACH 0x08
+#define _SIDD_CMP_EQUAL_ORDERED 0x0c
+
+/* These macros specify the polarity of the operation. */
+#define _SIDD_POSITIVE_POLARITY 0x00
+#define _SIDD_NEGATIVE_POLARITY 0x10
+#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
+#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
+
+/* These macros are used in _mm_cmpXstri() to specify the return. */
+#define _SIDD_LEAST_SIGNIFICANT 0x00
+#define _SIDD_MOST_SIGNIFICANT 0x40
+
+/* These macros are used in _mm_cmpXstrm() to specify the return. */
+#define _SIDD_BIT_MASK 0x00
+#define _SIDD_UNIT_MASK 0x40
+
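+/* Editorial note: these values are OR-ed together to form the 8-bit immediate
+ * consumed by the _mm_cmpXstr* intrinsics below. For example,
+ * _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_LEAST_SIGNIFICANT selects
+ * unsigned-byte elements, element-wise equality, and the least-significant-bit
+ * index form of the result.
+ */
+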
+/* SSE4.2 Packed Comparison Intrinsics. */
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns a 128-bit integer vector representing the result
+/// mask of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words, the type of comparison to perform, and the format of the return
+/// value. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
+/// bytes. \n
+/// 0: The result is zero-extended to 16 bytes. \n
+/// 1: The result is expanded to 16 bytes (this expansion is performed by
+/// repeating each bit 8 or 16 times).
+/// \returns Returns a 128-bit integer vector representing the result mask of
+/// the comparison.
+#define _mm_cmpistrm(A, B, M) \
+ ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M)))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns an integer representing the result index of the
+/// comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words, the type of comparison to perform, and the format of the return
+/// value. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// Bit [6]: Determines whether the index of the lowest set bit or the
+/// highest set bit is returned. \n
+/// 0: The index of the least significant set bit. \n
+/// 1: The index of the most significant set bit. \n
+/// \returns Returns an integer representing the result index of the comparison.
+#define _mm_cmpistri(A, B, M) \
+ ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M)))
+
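+/* Usage sketch (editorial; names are illustrative): locating the first of a
+ * small set of delimiter bytes in a NUL-terminated 16-byte chunk using the
+ * "equal any" aggregation. The caller must guarantee 16 readable bytes.
+ *
+ *   static const char delims[16] = ",;: \t";  // implicit length ends at the NUL
+ *   __m128i set   = _mm_loadu_si128((const __m128i *)delims);
+ *   __m128i chunk = _mm_loadu_si128((const __m128i *)text);
+ *   int idx = _mm_cmpistri(set, chunk,
+ *                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
+ *                          _SIDD_LEAST_SIGNIFICANT);
+ *   // idx is the offset of the first delimiter in the chunk, or 16 if none
+ */
+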
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns a 128-bit integer vector representing the result
+/// mask of the comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words, the type of comparison to perform, and the format of the return
+/// value. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
+/// bytes. \n
+/// 0: The result is zero-extended to 16 bytes. \n
+/// 1: The result is expanded to 16 bytes (this expansion is performed by
+/// repeating each bit 8 or 16 times). \n
+/// \returns Returns a 128-bit integer vector representing the result mask of
+/// the comparison.
+#define _mm_cmpestrm(A, LA, B, LB, M) \
+ ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M)))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns an integer representing the result index of the
+/// comparison.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words, the type of comparison to perform, and the format of the return
+/// value. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// Bit [6]: Determines whether the index of the lowest set bit or the
+/// highest set bit is returned. \n
+/// 0: The index of the least significant set bit. \n
+/// 1: The index of the most significant set bit. \n
+/// \returns Returns an integer representing the result index of the comparison.
+#define _mm_cmpestri(A, LA, B, LB, M) \
+ ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M)))
+
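+/* Usage sketch (editorial; names are illustrative): the explicit-length form is
+ * useful when the data may legitimately contain NUL bytes, since LA and LB
+ * give the element counts directly.
+ *
+ *   int idx = _mm_cmpestri(needle, 3, haystack, 16,
+ *                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
+ *   // offset of the first byte of haystack equal to any of the first three
+ *   // bytes of needle, or 16 if there is no match
+ */
+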
+/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
+/// string in \a B is the maximum, otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// \returns Returns 1 if the bit mask is zero and the length of the string in
+/// \a B is the maximum; otherwise, returns 0.
+#define _mm_cmpistra(A, B, M) \
+ ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M)))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
+/// 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B.
+/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
+#define _mm_cmpistrc(A, B, M) \
+ ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M)))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns bit 0 of the resulting bit mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// \returns Returns bit 0 of the resulting bit mask.
+#define _mm_cmpistro(A, B, M) \
+ ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M)))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
+/// the maximum, otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// \returns Returns 1 if the length of the string in \a A is less than the
+/// maximum, otherwise, returns 0.
+#define _mm_cmpistrs(A, B, M) \
+ ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M)))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with implicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
+/// the maximum, otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B.
+/// \returns Returns 1 if the length of the string in \a B is less than the
+/// maximum, otherwise, returns 0.
+#define _mm_cmpistrz(A, B, M) \
+ ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(M)))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
+/// string in \a B is the maximum, otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B.
+/// \returns Returns 1 if the bit mask is zero and the length of the string in
+/// \a B is the maximum, otherwise, returns 0.
+#define _mm_cmpestra(A, LA, B, LB, M) \
+ ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M)))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
+/// returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
+#define _mm_cmpestrc(A, LA, B, LB, M) \
+ ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M)))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns bit 0 of the resulting bit mask.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B.
+/// \returns Returns bit 0 of the resulting bit mask.
+#define _mm_cmpestro(A, LA, B, LB, M) \
+ ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M)))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
+/// the maximum, otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B. \n
+/// \returns Returns 1 if the length of the string in \a A is less than the
+/// maximum, otherwise, returns 0.
+#define _mm_cmpestrs(A, LA, B, LB, M) \
+ ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M)))
+
+/// Uses the immediate operand \a M to perform a comparison of string
+/// data with explicitly defined lengths that is contained in source operands
+/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
+/// the maximum, otherwise, returns 0.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
+/// instruction.
+///
+/// \param A
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LA
+/// An integer that specifies the length of the string in \a A.
+/// \param B
+/// A 128-bit integer vector containing one of the source operands to be
+/// compared.
+/// \param LB
+/// An integer that specifies the length of the string in \a B.
+/// \param M
+/// An 8-bit immediate operand specifying whether the characters are bytes or
+/// words and the type of comparison to perform. \n
+/// Bits [1:0]: Determine source data format. \n
+/// 00: 16 unsigned bytes \n
+/// 01: 8 unsigned words \n
+/// 10: 16 signed bytes \n
+/// 11: 8 signed words \n
+/// Bits [3:2]: Determine comparison type and aggregation method. \n
+/// 00: Subset: Each character in \a B is compared for equality with all
+/// the characters in \a A. \n
+/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
+/// basis is greater than or equal for even-indexed elements in \a A,
+/// and less than or equal for odd-indexed elements in \a A. \n
+/// 10: Match: Compare each pair of corresponding characters in \a A and
+/// \a B for equality. \n
+/// 11: Substring: Search \a B for substring matches of \a A. \n
+/// Bits [5:4]: Determine whether to perform a one's complement on the bit
+/// mask of the comparison results. \n
+/// 00: No effect. \n
+/// 01: Negate the bit mask. \n
+/// 10: No effect. \n
+/// 11: Negate the bit mask only for bits with an index less than or equal
+/// to the size of \a A or \a B.
+/// \returns Returns 1 if the length of the string in \a B is less than the
+/// maximum, otherwise, returns 0.
+#define _mm_cmpestrz(A, LA, B, LB, M) \
+ ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
+ (__v16qi)(__m128i)(B), (int)(LB), \
+ (int)(M)))
+
+/* SSE4.2 Compare Packed Data -- Greater Than. */
+/// Compares each of the corresponding 64-bit values of the 128-bit
+/// integer vectors to determine if the values in the first operand are
+/// greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
+///
+/// \param __V1
+/// A 128-bit integer vector.
+/// \param __V2
+/// A 128-bit integer vector.
+/// \returns A 128-bit integer vector containing the comparison results.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
+{
+ return (__m128i)((__v2di)__V1 > (__v2di)__V2);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#include <popcntintrin.h>
+
+#include <crc32intrin.h>
+
+#endif /* __SMMINTRIN_H */
--- /dev/null
+/*===---- tbmintrin.h - TBM intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __TBMINTRIN_H
+#define __TBMINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
+
+#define __bextri_u32(a, b) \
+ ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
+ (unsigned int)(b)))
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcfill_u32(unsigned int __a)
+{
+ return __a & (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blci_u32(unsigned int __a)
+{
+ return __a | ~(__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcic_u32(unsigned int __a)
+{
+ return ~__a & (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcmsk_u32(unsigned int __a)
+{
+ return __a ^ (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blcs_u32(unsigned int __a)
+{
+ return __a | (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsfill_u32(unsigned int __a)
+{
+ return __a | (__a - 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__blsic_u32(unsigned int __a)
+{
+ return ~__a | (__a - 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__t1mskc_u32(unsigned int __a)
+{
+ return ~__a | (__a + 1);
+}
+
+static __inline__ unsigned int __DEFAULT_FN_ATTRS
+__tzmsk_u32(unsigned int __a)
+{
+ return ~__a & (__a - 1);
+}
+
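+/* Worked examples (editorial):
+ *   __blcfill_u32(0x27) = 0x27 & 0x28 = 0x20  (trailing ones cleared)
+ *   __blsfill_u32(0x28) = 0x28 | 0x27 = 0x2f  (bits below the lowest set bit filled)
+ *   __tzmsk_u32(0x28)   = ~0x28 & 0x27 = 0x07 (mask of the trailing zeros)
+ */
+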
+#ifdef __x86_64__
+#define __bextri_u64(a, b) \
+ ((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \
+ (unsigned long long)(b)))
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcfill_u64(unsigned long long __a)
+{
+ return __a & (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blci_u64(unsigned long long __a)
+{
+ return __a | ~(__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcic_u64(unsigned long long __a)
+{
+ return ~__a & (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcmsk_u64(unsigned long long __a)
+{
+ return __a ^ (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blcs_u64(unsigned long long __a)
+{
+ return __a | (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsfill_u64(unsigned long long __a)
+{
+ return __a | (__a - 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__blsic_u64(unsigned long long __a)
+{
+ return ~__a | (__a - 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__t1mskc_u64(unsigned long long __a)
+{
+ return ~__a | (__a + 1);
+}
+
+static __inline__ unsigned long long __DEFAULT_FN_ATTRS
+__tzmsk_u64(unsigned long long __a)
+{
+ return ~__a & (__a - 1);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __TBMINTRIN_H */
--- /dev/null
+/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __TMMINTRIN_H
+#define __TMMINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+#include <pmmintrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
+#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))
+
+/// Computes the absolute value of each of the packed 8-bit signed
+/// integers in the source operand and stores the 8-bit unsigned integer
+/// results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PABSB instruction.
+///
+/// \param __a
+/// A 64-bit vector of [8 x i8].
+/// \returns A 64-bit integer vector containing the absolute values of the
+/// elements in the operand.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_abs_pi8(__m64 __a)
+{
+ return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
+}
+
+/// Computes the absolute value of each of the packed 8-bit signed
+/// integers in the source operand and stores the 8-bit unsigned integer
+/// results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPABSB instruction.
+///
+/// \param __a
+/// A 128-bit vector of [16 x i8].
+/// \returns A 128-bit integer vector containing the absolute values of the
+/// elements in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi8(__m128i __a)
+{
+#if (__clang_major__ < 14)
+ return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
+#else
+ return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
+#endif
+}
+
+/// Computes the absolute value of each of the packed 16-bit signed
+/// integers in the source operand and stores the 16-bit unsigned integer
+/// results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PABSW instruction.
+///
+/// \param __a
+/// A 64-bit vector of [4 x i16].
+/// \returns A 64-bit integer vector containing the absolute values of the
+/// elements in the operand.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_abs_pi16(__m64 __a)
+{
+ return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
+}
+
+/// Computes the absolute value of each of the packed 16-bit signed
+/// integers in the source operand and stores the 16-bit unsigned integer
+/// results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPABSW instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16].
+/// \returns A 128-bit integer vector containing the absolute values of the
+/// elements in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi16(__m128i __a)
+{
+#if (__clang_major__ < 14)
+ return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
+#else
+ return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
+#endif
+}
+
+/// Computes the absolute value of each of the packed 32-bit signed
+/// integers in the source operand and stores the 32-bit unsigned integer
+/// results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PABSD instruction.
+///
+/// \param __a
+/// A 64-bit vector of [2 x i32].
+/// \returns A 64-bit integer vector containing the absolute values of the
+/// elements in the operand.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_abs_pi32(__m64 __a)
+{
+ return (__m64)__builtin_ia32_pabsd((__v2si)__a);
+}
+
+/// Computes the absolute value of each of the packed 32-bit signed
+/// integers in the source operand and stores the 32-bit unsigned integer
+/// results in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPABSD instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32].
+/// \returns A 128-bit integer vector containing the absolute values of the
+/// elements in the operand.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_abs_epi32(__m128i __a)
+{
+#if (__clang_major__ < 14)
+ return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
+#else
+ return (__m128i)__builtin_elementwise_abs((__v4si)__a);
+#endif
+}
+
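+/* Editorial note: like the underlying PABS* instructions, these functions do
+ * not saturate, so the most negative input value is returned unchanged:
+ *
+ *   __m128i x = _mm_abs_epi8(_mm_set1_epi8(-128)); // every lane remains 0x80
+ */
+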
+/// Concatenates the two 128-bit integer vector operands, and
+/// right-shifts the result by the number of bytes specified in the immediate
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c PALIGNR instruction.
+///
+/// \param a
+/// A 128-bit vector of [16 x i8] containing one of the source operands.
+/// \param b
+/// A 128-bit vector of [16 x i8] containing one of the source operands.
+/// \param n
+/// An immediate operand specifying how many bytes to right-shift the result.
+/// \returns A 128-bit integer vector containing the concatenated right-shifted
+/// value.
+#define _mm_alignr_epi8(a, b, n) \
+ ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
+ (__v16qi)(__m128i)(b), (n)))
+
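+/* Usage sketch (editorial; names are illustrative): extracting a 16-byte window
+ * that straddles two consecutive loads, a common PALIGNR pattern.
+ *
+ *   __m128i lo = _mm_loadu_si128((const __m128i *)buf);        // bytes 0..15
+ *   __m128i hi = _mm_loadu_si128((const __m128i *)(buf + 16)); // bytes 16..31
+ *   __m128i w  = _mm_alignr_epi8(hi, lo, 4);                   // bytes 4..19
+ */
+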
+/// Concatenates the two 64-bit integer vector operands, and right-shifts
+/// the result by the number of bytes specified in the immediate operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c PALIGNR instruction.
+///
+/// \param a
+/// A 64-bit vector of [8 x i8] containing one of the source operands.
+/// \param b
+/// A 64-bit vector of [8 x i8] containing one of the source operands.
+/// \param n
+/// An immediate operand specifying how many bytes to right-shift the result.
+/// \returns A 64-bit integer vector containing the concatenated right-shifted
+/// value.
+#define _mm_alignr_pi8(a, b, n) \
+ ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHADDW instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16] containing one of the source operands. The
+/// horizontal sums of the values are stored in the lower bits of the
+/// destination.
+/// \param __b
+/// A 128-bit vector of [8 x i16] containing one of the source operands. The
+/// horizontal sums of the values are stored in the upper bits of the
+/// destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
+/// both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadd_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHADDD instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32] containing one of the source operands. The
+/// horizontal sums of the values are stored in the lower bits of the
+/// destination.
+/// \param __b
+/// A 128-bit vector of [4 x i32] containing one of the source operands. The
+/// horizontal sums of the values are stored in the upper bits of the
+/// destination.
+/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
+/// both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadd_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
+}
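Applied twice, the horizontal add collapses a vector into a full reduction. A small sketch (the helper name is illustrative; _mm_cvtsi128_si32 comes from <emmintrin.h>):

static inline int hsum_epi32_example(__m128i v)
{
    __m128i s = _mm_hadd_epi32(v, v);   /* [v0+v1, v2+v3, v0+v1, v2+v3] */
    s = _mm_hadd_epi32(s, s);           /* every lane now holds the total */
    return _mm_cvtsi128_si32(s);        /* extract lane 0 */
}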
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 64-bit vectors of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHADDW instruction.
+///
+/// \param __a
+/// A 64-bit vector of [4 x i16] containing one of the source operands. The
+/// horizontal sums of the values are stored in the lower bits of the
+/// destination.
+/// \param __b
+/// A 64-bit vector of [4 x i16] containing one of the source operands. The
+/// horizontal sums of the values are stored in the upper bits of the
+/// destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
+/// operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_hadd_pi16(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 64-bit vectors of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHADDD instruction.
+///
+/// \param __a
+/// A 64-bit vector of [2 x i32] containing one of the source operands. The
+/// horizontal sums of the values are stored in the lower bits of the
+/// destination.
+/// \param __b
+/// A 64-bit vector of [2 x i32] containing one of the source operands. The
+/// horizontal sums of the values are stored in the upper bits of the
+/// destination.
+/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
+/// operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_hadd_pi32(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
+/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
+/// 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHADDSW instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16] containing one of the source operands. The
+/// horizontal sums of the values are stored in the lower bits of the
+/// destination.
+/// \param __b
+/// A 128-bit vector of [8 x i16] containing one of the source operands. The
+/// horizontal sums of the values are stored in the upper bits of the
+/// destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
+/// sums of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hadds_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// Horizontally adds the adjacent pairs of values contained in 2 packed
+/// 64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
+/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
+/// 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHADDSW instruction.
+///
+/// \param __a
+/// A 64-bit vector of [4 x i16] containing one of the source operands. The
+/// horizontal sums of the values are stored in the lower bits of the
+/// destination.
+/// \param __b
+/// A 64-bit vector of [4 x i16] containing one of the source operands. The
+/// horizontal sums of the values are stored in the upper bits of the
+/// destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
+/// sums of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_hadds_pi16(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHSUBW instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16] containing one of the source operands. The
+/// horizontal differences between the values are stored in the lower bits of
+/// the destination.
+/// \param __b
+/// A 128-bit vector of [8 x i16] containing one of the source operands. The
+/// horizontal differences between the values are stored in the upper bits of
+/// the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
+/// of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsub_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHSUBD instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x i32] containing one of the source operands. The
+/// horizontal differences between the values are stored in the lower bits of
+/// the destination.
+/// \param __b
+/// A 128-bit vector of [4 x i32] containing one of the source operands. The
+/// horizontal differences between the values are stored in the upper bits of
+/// the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
+/// of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsub_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 64-bit vectors of [4 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHSUBW instruction.
+///
+/// \param __a
+/// A 64-bit vector of [4 x i16] containing one of the source operands. The
+/// horizontal differences between the values are stored in the lower bits of
+/// the destination.
+/// \param __b
+/// A 64-bit vector of [4 x i16] containing one of the source operands. The
+/// horizontal differences between the values are stored in the upper bits of
+/// the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
+/// of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_hsub_pi16(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 64-bit vectors of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHSUBD instruction.
+///
+/// \param __a
+/// A 64-bit vector of [2 x i32] containing one of the source operands. The
+/// horizontal differences between the values are stored in the lower bits of
+/// the destination.
+/// \param __b
+/// A 64-bit vector of [2 x i32] containing one of the source operands. The
+/// horizontal differences between the values are stored in the upper bits of
+/// the destination.
+/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
+/// of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_hsub_pi32(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 128-bit vectors of [8 x i16]. Positive differences greater than
+/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
+/// saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPHSUBSW instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16] containing one of the source operands. The
+/// horizontal differences between the values are stored in the lower bits of
+/// the destination.
+/// \param __b
+/// A 128-bit vector of [8 x i16] containing one of the source operands. The
+/// horizontal differences between the values are stored in the upper bits of
+/// the destination.
+/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
+/// differences of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubs_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// Horizontally subtracts the adjacent pairs of values contained in 2
+/// packed 64-bit vectors of [4 x i16]. Positive differences greater than
+/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
+/// saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PHSUBSW instruction.
+///
+/// \param __a
+/// A 64-bit vector of [4 x i16] containing one of the source operands. The
+/// horizontal differences between the values are stored in the lower bits of
+/// the destination.
+/// \param __b
+/// A 64-bit vector of [4 x i16] containing one of the source operands. The
+/// horizontal differences between the values are stored in the upper bits of
+/// the destination.
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
+/// differences of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_hsubs_pi16(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// Multiplies corresponding pairs of packed 8-bit unsigned integer
+/// values contained in the first source operand and packed 8-bit signed
+/// integer values contained in the second source operand, adds pairs of
+/// contiguous products with signed saturation, and writes the 16-bit sums to
+/// the corresponding bits in the destination.
+///
+/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
+/// both operands are multiplied, and the sum of both results is written to
+/// bits [15:0] of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the first source operand.
+/// \param __b
+/// A 128-bit integer vector containing the second source operand.
+/// \returns A 128-bit integer vector containing the sums of products of both
+/// operands: \n
+/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
+/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
+/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
+/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
+/// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
+/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
+/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
+/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddubs_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// Multiplies corresponding pairs of packed 8-bit unsigned integer
+/// values contained in the first source operand and packed 8-bit signed
+/// integer values contained in the second source operand, adds pairs of
+/// contiguous products with signed saturation, and writes the 16-bit sums to
+/// the corresponding bits in the destination.
+///
+/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
+/// both operands are multiplied, and the sum of both results is written to
+/// bits [15:0] of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMADDUBSW instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing the first source operand.
+/// \param __b
+/// A 64-bit integer vector containing the second source operand.
+/// \returns A 64-bit integer vector containing the sums of products of both
+/// operands: \n
+/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
+/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
+/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
+/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_maddubs_pi16(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
+}
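PMADDUBSW pairs naturally with PMADDWD for an unsigned-by-signed byte dot product widened to 32 bits. A sketch (assumes the SSE2 helpers _mm_madd_epi16 and _mm_set1_epi16 from <emmintrin.h>):

static inline __m128i dot_u8s8_example(__m128i u8, __m128i s8)
{
    __m128i p16 = _mm_maddubs_epi16(u8, s8);         /* 8 x i16 pair sums (saturated) */
    return _mm_madd_epi16(p16, _mm_set1_epi16(1));   /* widen pairs to 4 x i32 */
}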
+
+/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
+/// products to the 18 most significant bits by right-shifting, rounds the
+/// truncated value by adding 1, and writes bits [16:1] to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMULHRSW instruction.
+///
+/// \param __a
+/// A 128-bit vector of [8 x i16] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [8 x i16] containing one of the source operands.
+/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
+/// products of both operands.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_mulhrs_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
+/// products to the 18 most significant bits by right-shifting, rounds the
+/// truncated value by adding 1, and writes bits [16:1] to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PMULHRSW instruction.
+///
+/// \param __a
+/// A 64-bit vector of [4 x i16] containing one of the source operands.
+/// \param __b
+/// A 64-bit vector of [4 x i16] containing one of the source operands.
+/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
+/// products of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_mulhrs_pi16(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
+}
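In fixed-point terms, PMULHRSW is a rounding Q15 multiply, i.e. (a * b + 0x4000) >> 15 per lane. A sketch:

static inline __m128i q15_mul_example(__m128i a, __m128i b)
{
    /* Rounds to nearest; only a == b == INT16_MIN overflows (result 0x8000). */
    return _mm_mulhrs_epi16(a, b);
}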
+
+/// Copies the 8-bit integers from a 128-bit integer vector to the
+/// destination or clears 8-bit values in the destination, as specified by
+/// the second source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSHUFB instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the values to be copied.
+/// \param __b
+/// A 128-bit integer vector containing control bytes corresponding to
+/// positions in the destination:
+/// Bit 7: \n
+/// 1: Clear the corresponding byte in the destination. \n
+/// 0: Copy the selected source byte to the corresponding byte in the
+/// destination. \n
+/// Bits [6:4] Reserved. \n
+/// Bits [3:0] select the source byte to be copied.
+/// \returns A 128-bit integer vector containing the copied or cleared values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shuffle_epi8(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// Copies the 8-bit integers from a 64-bit integer vector to the
+/// destination or clears 8-bit values in the destination, as specified by
+/// the second source operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSHUFB instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing the values to be copied.
+/// \param __b
+/// A 64-bit integer vector containing control bytes corresponding to
+/// positions in the destination:
+/// Bit 7: \n
+/// 1: Clear the corresponding byte in the destination. \n
+/// 0: Copy the selected source byte to the corresponding byte in the
+/// destination. \n
+/// Bits [3:0] select the source byte to be copied.
+/// \returns A 64-bit integer vector containing the copied or cleared values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_shuffle_pi8(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
+}
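PSHUFB is a general byte permutation; for example, reversing the byte order of a register takes a single instruction. A sketch (not part of the patch):

static inline __m128i reverse_bytes_example(__m128i v)
{
    const __m128i idx = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                       7,  6,  5,  4,  3,  2, 1, 0);
    /* Destination byte i receives source byte idx[i]. */
    return _mm_shuffle_epi8(v, idx);
}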
+
+/// For each 8-bit integer in the first source operand, perform one of
+/// the following actions as specified by the second source operand.
+///
+/// If the byte in the second source is negative, calculate the two's
+/// complement of the corresponding byte in the first source, and write that
+/// value to the destination. If the byte in the second source is positive,
+/// copy the corresponding byte from the first source to the destination. If
+/// the byte in the second source is zero, clear the corresponding byte in
+/// the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSIGNB instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the values to be copied.
+/// \param __b
+/// A 128-bit integer vector containing control bytes corresponding to
+/// positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi8(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
+}
+
+/// For each 16-bit integer in the first source operand, perform one of
+/// the following actions as specified by the second source operand.
+///
+/// If the word in the second source is negative, calculate the two's
+/// complement of the corresponding word in the first source, and write that
+/// value to the destination. If the word in the second source is positive,
+/// copy the corresponding word from the first source to the destination. If
+/// the word in the second source is zero, clear the corresponding word in
+/// the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSIGNW instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the values to be copied.
+/// \param __b
+/// A 128-bit integer vector containing control words corresponding to
+/// positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi16(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
+}
+
+/// For each 32-bit integer in the first source operand, perform one of
+/// the following actions as specified by the second source operand.
+///
+/// If the doubleword in the second source is negative, calculate the two's
+/// complement of the corresponding word in the first source, and write that
+/// value to the destination. If the doubleword in the second source is
+/// positive, copy the corresponding word from the first source to the
+/// destination. If the doubleword in the second source is zero, clear the
+/// corresponding word in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPSIGND instruction.
+///
+/// \param __a
+/// A 128-bit integer vector containing the values to be copied.
+/// \param __b
+/// A 128-bit integer vector containing control doublewords corresponding to
+/// positions in the destination.
+/// \returns A 128-bit integer vector containing the resultant values.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sign_epi32(__m128i __a, __m128i __b)
+{
+ return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
+}
+
+/// For each 8-bit integer in the first source operand, perform one of
+/// the following actions as specified by the second source operand.
+///
+/// If the byte in the second source is negative, calculate the two's
+/// complement of the corresponding byte in the first source, and write that
+/// value to the destination. If the byte in the second source is positive,
+/// copy the corresponding byte from the first source to the destination. If
+/// the byte in the second source is zero, clear the corresponding byte in
+/// the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSIGNB instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing the values to be copied.
+/// \param __b
+/// A 64-bit integer vector containing control bytes corresponding to
+/// positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_sign_pi8(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
+}
+
+/// For each 16-bit integer in the first source operand, perform one of
+/// the following actions as specified by the second source operand.
+///
+/// If the word in the second source is negative, calculate the two's
+/// complement of the corresponding word in the first source, and write that
+/// value to the destination. If the word in the second source is positive,
+/// copy the corresponding word from the first source to the destination. If
+/// the word in the second source is zero, clear the corresponding word in
+/// the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSIGNW instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing the values to be copied.
+/// \param __b
+/// A 64-bit integer vector containing control words corresponding to
+/// positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_sign_pi16(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// For each 32-bit integer in the first source operand, perform one of
+/// the following actions as specified by the second source operand.
+///
+/// If the doubleword in the second source is negative, calculate the two's
+/// complement of the corresponding doubleword in the first source, and
+/// write that value to the destination. If the doubleword in the second
+/// source is positive, copy the corresponding doubleword from the first
+/// source to the destination. If the doubleword in the second source is
+/// zero, clear the corresponding doubleword in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PSIGND instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing the values to be copied.
+/// \param __b
+/// A 64-bit integer vector containing two control doublewords corresponding
+/// to positions in the destination.
+/// \returns A 64-bit integer vector containing the resultant values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_sign_pi32(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
+}
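One idiomatic use of PSIGN is a branch-free conditional negate/zero; for instance, _mm_sign_epi16(v, v) is another spelling of _mm_abs_epi16. A sketch:

static inline __m128i abs_via_sign_example(__m128i v)
{
    /* Negative lanes are negated, positive lanes copied, zero lanes stay zero;
     * like PABSW, INT16_MIN remains 0x8000. */
    return _mm_sign_epi16(v, v);
}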
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_MMX
+
+#endif /* __TMMINTRIN_H */
--- /dev/null
+/*===------------- tsxldtrkintrin.h - tsxldtrk intrinsics ------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <tsxldtrkintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __TSXLDTRKINTRIN_H
+#define __TSXLDTRKINTRIN_H
+
+/* Define the default attributes for the functions in this file */
+#define _DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("tsxldtrk")))
+
+/// Marks the start of a TSX (RTM) suspend load address tracking region. If
+/// this intrinsic is used inside a transactional region, subsequent loads
+/// are not added to the read set of the transaction. If it is used inside a
+/// suspend load address tracking region, it causes a transaction abort.
+/// If it is used outside of a transactional region, it behaves like a NOP.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c XSUSLDTRK instruction.
+///
+static __inline__ void _DEFAULT_FN_ATTRS
+_xsusldtrk (void)
+{
+ __builtin_ia32_xsusldtrk();
+}
+
+/// Marks the end of a TSX (RTM) suspend load address tracking region. If this
+/// intrinsic is used inside a suspend load address tracking region, it ends
+/// the suspend region and all following load addresses are added to the
+/// transaction read set. If it is used inside an active transaction but not
+/// in a suspend region, it causes a transaction abort. If it is used outside
+/// of a transactional region, it behaves like a NOP.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c XRESLDTRK instruction.
+///
+static __inline__ void _DEFAULT_FN_ATTRS
+_xresldtrk (void)
+{
+ __builtin_ia32_xresldtrk();
+}
+
+#undef _DEFAULT_FN_ATTRS
+
+#endif /* __TSXLDTRKINTRIN_H */
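A hedged usage sketch, assuming RTM support as well (_xbegin, _xend and _XBEGIN_STARTED come from the RTM header pulled in by <immintrin.h>); loads performed between the two calls stay out of the transaction's read set, so concurrent writers to that data cannot abort it:

static int add_untracked_sample(volatile long *counter, const long *noisy_data)
{
    if (_xbegin() == _XBEGIN_STARTED) {
        _xsusldtrk();               /* suspend load address tracking */
        long sample = *noisy_data;  /* this load is not tracked */
        _xresldtrk();               /* resume tracking */
        *counter += sample;
        _xend();
        return 1;
    }
    return 0;                       /* transaction aborted; caller falls back */
}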
--- /dev/null
+/*===------------------ uintrintrin.h - UINTR intrinsics -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86GPRINTRIN_H
+#error "Never use <uintrintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef __UINTRINTRIN_H
+#define __UINTRINTRIN_H
+
+/* Define the default attributes for the functions in this file */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("uintr")))
+
+#ifdef __x86_64__
+
+struct __uintr_frame
+{
+ unsigned long long rip;
+ unsigned long long rflags;
+ unsigned long long rsp;
+};
+
+/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a
+/// user interrupt cannot be delivered on the instruction boundary following
+/// CLUI. Can be executed only if CR4.UINT = 1, the logical processor is in
+/// 64-bit mode, and software is not executing inside an enclave; otherwise,
+/// it causes an invalid-opcode exception. Causes a transactional abort if
+/// executed inside a transactional region; the abort loads EAX as it would
+/// have had it been due to an execution of CLI.
+///
+/// \headerfile <x86gprintrin.h>
+///
+/// This intrinsic corresponds to the <c> CLUI </c> instruction.
+///
+/// \operation
+/// UIF := 0
+/// \endoperation
+static __inline__ void __DEFAULT_FN_ATTRS
+_clui (void)
+{
+ __builtin_ia32_clui();
+}
+
+/// Sets the user interrupt flag (UIF). Its effect takes place immediately; a
+/// user interrupt may be delivered on the instruction boundary following
+/// STUI. Can be executed only if CR4.UINT = 1, the logical processor is in
+/// 64-bit mode, and software is not executing inside an enclave; otherwise,
+/// it causes an invalid-opcode exception. Causes a transactional abort if
+/// executed inside a transactional region; the abort loads EAX as it would
+/// have had it been due to an execution of STI.
+///
+/// \headerfile <x86gprintrin.h>
+///
+/// This intrinsic corresponds to the <c> STUI </c> instruction.
+///
+/// \operation
+/// UIF := 1
+/// \endoperation
+static __inline__ void __DEFAULT_FN_ATTRS
+_stui (void)
+{
+ __builtin_ia32_stui();
+}
+
+/// Gets the current value of the user interrupt flag (UIF). Can be executed
+/// regardless of CPL and inside a transactional region, but only if
+/// CR4.UINT = 1, the logical processor is in 64-bit mode, and software is
+/// not executing inside an enclave; otherwise, it causes an invalid-opcode
+/// exception.
+///
+/// \headerfile <x86gprintrin.h>
+///
+/// This intrinsic corresponds to the <c> TESTUI </c> instruction.
+///
+/// \returns The current value of the user interrupt flag (UIF).
+///
+/// \operation
+/// CF := UIF
+/// ZF := 0
+/// AF := 0
+/// OF := 0
+/// PF := 0
+/// SF := 0
+/// dst := CF
+/// \endoperation
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_testui (void)
+{
+ return __builtin_ia32_testui();
+}
+
+/// Sends an interprocessor user interrupt. Can be executed only if
+/// CR4.UINT = IA32_UINT_TT[0] = 1, the logical processor is in 64-bit mode,
+/// and software is not executing inside an enclave; otherwise, it causes an
+/// invalid-opcode exception. May be executed at any privilege level; all of
+/// its memory accesses are performed with supervisor privilege.
+///
+/// \headerfile <x86gprintrin.h>
+///
+/// This intrinsic corresponds to the <c> SENDUIPI </c> instruction.
+///
+/// \param __a
+/// Index of user-interrupt target table entry in user-interrupt target
+/// table.
+///
+/// \operation
+/// IF __a > UITTSZ
+/// GP (0)
+/// FI
+/// tempUITTE := MEM[UITTADDR + (a<<4)]
+/// // tempUITTE must be valid, and can't have any reserved bit set
+/// IF (tempUITTE.V == 0 OR tempUITTE[7:1] != 0)
+/// GP (0)
+/// FI
+/// tempUPID := MEM[tempUITTE.UPIDADDR] // under lock
+/// // tempUPID can't have any reserved bit set
+/// IF (tempUPID[15:2] != 0 OR tempUPID[31:24] != 0)
+/// GP (0) // release lock
+/// FI
+/// tempUPID.PIR[tempUITTE.UV] := 1;
+/// IF (tempUPID.SN == 0 AND tempUPID.ON == 0)
+/// tempUPID.ON := 1
+/// sendNotify := 1
+/// ELSE
+/// sendNotify := 0
+/// FI
+/// MEM[tempUITTE.UPIDADDR] := tempUPID // release lock
+/// IF sendNotify == 1
+/// IF IA32_APIC_BASE[10] == 1 // local APIC is in x2APIC mode
+/// // send ordinary IPI with vector tempUPID.NV to 32-bit physical APIC
+/// // ID tempUPID.NDST
+/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST)
+/// ELSE
+/// // send ordinary IPI with vector tempUPID.NV to 8-bit physical APIC
+/// // ID tempUPID.NDST[15:8]
+/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8])
+/// FI
+/// FI
+/// \endoperation
+static __inline__ void __DEFAULT_FN_ATTRS
+_senduipi (unsigned long long __a)
+{
+ __builtin_ia32_senduipi(__a);
+}
+
+#endif /* __x86_64__ */
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __UINTRINTRIN_H */
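A minimal sketch using only the intrinsics defined above (64-bit only; it assumes the operating system has already enabled user interrupts for the thread, and handler registration is left out because that is an OS-specific interface):

#ifdef __x86_64__
static inline unsigned char ui_block(void)
{
    unsigned char was_set = _testui();  /* TESTUI: read the current UIF */
    _clui();                            /* CLUI: block user-interrupt delivery */
    return was_set;
}

static inline void ui_unblock(unsigned char was_set)
{
    if (was_set)
        _stui();                        /* STUI: allow delivery again */
}
#endif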
--- /dev/null
+/*===------------------ vaesintrin.h - VAES intrinsics ---------------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <vaesintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VAESINTRIN_H
+#define __VAESINTRIN_H
+
+/* Default attributes for YMM forms. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"), __min_vector_width__(256)))
+
+/* Default attributes for ZMM forms. */
+#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes"), __min_vector_width__(512)))
+
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+ _mm256_aesenc_epi128(__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_aesenc256((__v4di) __A,
+ (__v4di) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+ _mm256_aesdec_epi128(__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_aesdec256((__v4di) __A,
+ (__v4di) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+ _mm256_aesenclast_epi128(__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_aesenclast256((__v4di) __A,
+ (__v4di) __B);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+ _mm256_aesdeclast_epi128(__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_aesdeclast256((__v4di) __A,
+ (__v4di) __B);
+}
+
+#ifdef __AVX512FINTRIN_H
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+ _mm512_aesenc_epi128(__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
+ (__v8di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+ _mm512_aesdec_epi128(__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
+ (__v8di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+ _mm512_aesenclast_epi128(__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
+ (__v8di) __B);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS_F
+ _mm512_aesdeclast_epi128(__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A,
+ (__v8di) __B);
+}
+#endif // __AVX512FINTRIN_H
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_F
+
+#endif // __VAESINTRIN_H
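A usage sketch (assumes VAES plus AVX2, e.g. -mvaes -mavx2): one encryption round applied to two independent 128-bit blocks packed into a 256-bit register; key expansion is outside the scope of this header:

static inline __m256i aes_round_x2_example(__m256i two_blocks, __m256i two_round_keys)
{
    /* Each 128-bit lane is processed independently, exactly as VAESENC
     * would process the corresponding xmm halves. */
    return _mm256_aesenc_epi128(two_blocks, two_round_keys);
}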
--- /dev/null
+/*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __VPCLMULQDQINTRIN_H
+#define __VPCLMULQDQINTRIN_H
+
+#define _mm256_clmulepi64_epi128(A, B, I) \
+ ((__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), \
+ (char)(I)))
+
+#ifdef __AVX512FINTRIN_H
+#define _mm512_clmulepi64_epi128(A, B, I) \
+ ((__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), \
+ (char)(I)))
+#endif // __AVX512FINTRIN_H
+
+#endif /* __VPCLMULQDQINTRIN_H */
+
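A usage sketch (assumes VPCLMULQDQ plus AVX2): two independent 64x64 -> 128-bit carry-less multiplies in one instruction, with immediate 0x00 selecting the low quadword of each 128-bit lane of both operands (a typical GHASH/CRC building block):

static inline __m256i clmul_low_x2_example(__m256i a, __m256i b)
{
    return _mm256_clmulepi64_epi128(a, b, 0x00);
}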
--- /dev/null
+/*===----------------------- waitpkgintrin.h - WAITPKG --------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <waitpkgintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __WAITPKGINTRIN_H
+#define __WAITPKGINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("waitpkg")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_umonitor (void * __address)
+{
+ __builtin_ia32_umonitor (__address);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_umwait (unsigned int __control, unsigned long long __counter)
+{
+ return __builtin_ia32_umwait (__control,
+ (unsigned int)(__counter >> 32), (unsigned int)__counter);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS
+_tpause (unsigned int __control, unsigned long long __counter)
+{
+ return __builtin_ia32_tpause (__control,
+ (unsigned int)(__counter >> 32), (unsigned int)__counter);
+}
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __WAITPKGINTRIN_H */
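A usage sketch (assumes WAITPKG support): waiting for a flag with UMONITOR/UMWAIT instead of a hard busy spin. The second argument is a TSC deadline, and the return value is non-zero when the deadline, rather than a store to the monitored line, ended the wait:

static inline void wait_for_flag_example(volatile int *flag,
                                         unsigned long long tsc_deadline)
{
    while (*flag == 0) {
        _umonitor((void *)flag);      /* arm the address monitor */
        if (*flag != 0)               /* re-check to avoid a lost wakeup */
            break;
        _umwait(0, tsc_deadline);     /* control 0 selects the deeper C0.2 state */
    }
}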
--- /dev/null
+/*===-------------- wbnoinvdintrin.h - wbnoinvd intrinsic-------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
+#error "Never use <wbnoinvdintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __WBNOINVDINTRIN_H
+#define __WBNOINVDINTRIN_H
+
+static __inline__ void
+ __attribute__((__always_inline__, __nodebug__, __target__("wbnoinvd")))
+_wbnoinvd (void)
+{
+ __builtin_ia32_wbnoinvd ();
+}
+
+#endif /* __WBNOINVDINTRIN_H */
--- /dev/null
+/*===---- wmmintrin.h - AES intrinsics ------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __WMMINTRIN_H
+#define __WMMINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+#include <emmintrin.h>
+
+#include <__wmmintrin_aes.h>
+
+#include <__wmmintrin_pclmul.h>
+
+#endif /* __WMMINTRIN_H */
--- /dev/null
+/*===--------------- x86gprintrin.h - X86 GPR intrinsics ------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86GPRINTRIN_H
+#define __X86GPRINTRIN_H
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__HRESET__)
+#include <hresetintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__UINTR__)
+#include <uintrintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__CRC32__)
+#include <crc32intrin.h>
+#endif
+
+#define __SSC_MARK(Tag) \
+ __asm__ __volatile__("mov {%%ebx, %%eax|eax, ebx}; " \
+ "mov {%0, %%ebx|ebx, %0}; " \
+ ".byte 0x64, 0x67, 0x90; " \
+ "mov {%%eax, %%ebx|ebx, eax};" ::"i"(Tag) \
+ : "%eax");
+
+#endif /* __X86GPRINTRIN_H */
--- /dev/null
+/*===---- x86intrin.h - X86 intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#define __X86INTRIN_H
+
+#include <ia32intrin.h>
+
+#include <immintrin.h>
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__3dNOW__)
+#include <mm3dnow.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__PRFCHW__)
+#include <prfchwintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__SSE4A__)
+#include <ammintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__FMA4__)
+#include <fma4intrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__XOP__)
+#include <xopintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__TBM__)
+#include <tbmintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__LWP__)
+#include <lwpintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__MWAITX__)
+#include <mwaitxintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__CLZERO__)
+#include <clzerointrin.h>
+#endif
+
+
+#endif /* __X86INTRIN_H */
--- /dev/null
+/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __XMMINTRIN_H
+#define __XMMINTRIN_H
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#error "This header is only meant to be used on x86 and x64 architecture"
+#endif
+
+#include <mmintrin.h>
+
+typedef int __v4si __attribute__((__vector_size__(16)));
+typedef float __v4sf __attribute__((__vector_size__(16)));
+typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
+
+typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
+
+/* Unsigned types */
+typedef unsigned int __v4su __attribute__((__vector_size__(16)));
+
+/* This header should only be included in a hosted environment as it depends on
+ * a standard library to provide allocation routines. */
+#if __STDC_HOSTED__
+#include <mm_malloc.h>
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
+
+/// Adds the 32-bit float values in the low-order bits of the operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// The lower 32 bits of this operand are used in the calculation.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// The lower 32 bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
+/// of the lower 32 bits of both operands. The upper 96 bits are copied from
+/// the upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_add_ss(__m128 __a, __m128 __b)
+{
+ __a[0] += __b[0];
+ return __a;
+}
+
+/// Adds two 128-bit vectors of [4 x float], and returns the results of
+/// the addition.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the sums of both
+/// operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_add_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)((__v4sf)__a + (__v4sf)__b);
+}
+
+/// Subtracts the 32-bit float value in the low-order bits of the second
+/// operand from the corresponding value in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
+/// of this operand are used in the calculation.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
+/// bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+/// difference of the lower 32 bits of both operands. The upper 96 bits are
+/// copied from the upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sub_ss(__m128 __a, __m128 __b)
+{
+ __a[0] -= __b[0];
+ return __a;
+}
+
+/// Subtracts each of the values of the second operand from the first
+/// operand, both of which are 128-bit vectors of [4 x float] and returns
+/// the results of the subtraction.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the minuend.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing the subtrahend.
+/// \returns A 128-bit vector of [4 x float] containing the differences between
+/// both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sub_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)((__v4sf)__a - (__v4sf)__b);
+}
+
+/// Multiplies two 32-bit float values in the low-order bits of the
+/// operands.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// The lower 32 bits of this operand are used in the calculation.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// The lower 32 bits of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the product of the lower
+/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
+/// bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mul_ss(__m128 __a, __m128 __b)
+{
+ __a[0] *= __b[0];
+ return __a;
+}
+
+/// Multiplies two 128-bit vectors of [4 x float] and returns the
+/// results of the multiplication.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the products of both
+/// operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_mul_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)((__v4sf)__a * (__v4sf)__b);
+}
+
+/// Divides the value in the low-order 32 bits of the first operand by
+/// the corresponding value in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
+/// bits of this operand are used in the calculation.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
+/// of this operand are used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the quotients of the
+/// lower 32 bits of both operands. The upper 96 bits are copied from the
+/// upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_div_ss(__m128 __a, __m128 __b)
+{
+ __a[0] /= __b[0];
+ return __a;
+}
+
+/// Divides two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the dividend.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing the divisor.
+/// \returns A 128-bit vector of [4 x float] containing the quotients of both
+/// operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_div_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)((__v4sf)__a / (__v4sf)__b);
+}
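The scalar (_ss) and packed (_ps) forms above differ only in how many lanes they touch. A short sketch (uses _mm_setr_ps, defined later in this header):

static inline void add_forms_example(void)
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 lo  = _mm_add_ss(a, b);   /* {11, 2, 3, 4}: upper lanes copied from a */
    __m128 all = _mm_add_ps(a, b);   /* {11, 22, 33, 44} */
    (void)lo; (void)all;
}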
+
+/// Calculates the square root of the value stored in the low-order bits
+/// of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the square root of the
+/// value in the low-order bits of the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sqrt_ss(__m128 __a)
+{
+ return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
+}
+
+/// Calculates the square roots of the values stored in a 128-bit vector
+/// of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the square roots of the
+/// values in the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_sqrt_ps(__m128 __a)
+{
+ return __builtin_ia32_sqrtps((__v4sf)__a);
+}
+
+/// Calculates the approximate reciprocal of the value stored in the
+/// low-order bits of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+/// reciprocal of the value in the low-order bits of the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp_ss(__m128 __a)
+{
+ return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
+}
+
+/// Calculates the approximate reciprocals of the values stored in a
+/// 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+/// reciprocals of the values in the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rcp_ps(__m128 __a)
+{
+ return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
+}
+
+/// Calculates the approximate reciprocal of the square root of the value
+/// stored in the low-order bits of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the calculation.
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+/// reciprocal of the square root of the value in the low-order bits of the
+/// operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt_ss(__m128 __a)
+{
+ return __builtin_ia32_rsqrtss((__v4sf)__a);
+}
+
+/// Calculates the approximate reciprocals of the square roots of the
+/// values stored in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the approximate
+/// reciprocals of the square roots of the values in the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_rsqrt_ps(__m128 __a)
+{
+ return __builtin_ia32_rsqrtps((__v4sf)__a);
+}
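The reciprocal estimates are only about 12 bits accurate; a common refinement is one Newton-Raphson step, sketched here for the reciprocal square root (y' = y * (1.5 - 0.5 * x * y * y)); _mm_set1_ps is defined later in this header:

static inline __m128 rsqrt_nr_example(__m128 x)
{
    __m128 y   = _mm_rsqrt_ps(x);                    /* ~12-bit estimate */
    __m128 xyy = _mm_mul_ps(x, _mm_mul_ps(y, y));    /* x * y * y */
    return _mm_mul_ps(y, _mm_sub_ps(_mm_set1_ps(1.5f),
                                    _mm_mul_ps(_mm_set1_ps(0.5f), xyy)));
}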
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands and returns the lesser value in the low-order bits of the
+/// vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+/// minimum value between both operands. The upper 96 bits are copied from
+/// the upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_min_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 128-bit vectors of [4 x float] and returns the lesser
+/// of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands.
+/// \returns A 128-bit vector of [4 x float] containing the minimum values
+/// between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_min_ps(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands and returns the greater value in the low-order bits of a 128-bit
+/// vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+/// maximum value between both operands. The upper 96 bits are copied from
+/// the upper 96 bits of the first source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_max_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 128-bit vectors of [4 x float] and returns the greater
+/// of each pair of values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands.
+/// \returns A 128-bit vector of [4 x float] containing the maximum values
+/// between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_max_ps(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
+}
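MINPS and MAXPS combine into a branch-free clamp. A sketch (if a lane is NaN, the instructions return the second operand):

static inline __m128 clamp_ps_example(__m128 v, __m128 lo, __m128 hi)
{
    /* Per lane: max(v, lo), then min of that with hi. */
    return _mm_min_ps(_mm_max_ps(v, lo), hi);
}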
+
+/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector containing one of the source operands.
+/// \param __b
+/// A 128-bit vector containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
+/// values between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_and_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)((__v4su)__a & (__v4su)__b);
+}
+
+/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
+/// the one's complement of the values contained in the first source
+/// operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the first source operand. The
+/// one's complement of this value is used in the bitwise AND.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing the second source operand.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
+/// one's complement of the first operand and the values in the second
+/// operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_andnot_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)(~(__v4su)__a & (__v4su)__b);
+}
+
+/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
+/// values between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_or_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)((__v4su)__a | (__v4su)__b);
+}
+
+/// Performs a bitwise exclusive OR of two 128-bit vectors of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the source operands.
+/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
+/// of the values between both operands.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_xor_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)((__v4su)__a ^ (__v4su)__b);
+}
+
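+/* Usage sketch (informative): these logical intrinsics act on the raw bit
+ * pattern of each float lane, so they are commonly used for sign-bit
+ * manipulation (x stands for any [4 x float] vector; _mm_set1_ps is defined
+ * later in this header):
+ *
+ *   __m128 sign = _mm_set1_ps(-0.0f);       // only the sign bit set
+ *   __m128 absx = _mm_andnot_ps(sign, x);   // clear the sign bits: |x|
+ *   __m128 negx = _mm_xor_ps(x, sign);      // flip the sign bits: -x
+ */
+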
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands for equality and returns the result of the comparison in the
+/// low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+/// in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpeq_ss(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares each of the corresponding 32-bit float values of the
+/// 128-bit vectors of [4 x float] for equality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpeq_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the value in the first operand is less than the
+/// corresponding value in the second operand and returns the result of the
+/// comparison in the low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+/// in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmplt_ss(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares each of the corresponding 32-bit float values of the
+/// 128-bit vectors of [4 x float] to determine if the values in the first
+/// operand are less than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmplt_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
+}
+
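+/* Usage sketch (informative): the packed comparisons return a per-lane mask
+ * of all ones (true) or all zeros (false), which combines with the logical
+ * intrinsics above for branchless selection, e.g. a per-lane minimum:
+ *
+ *   __m128 lt  = _mm_cmplt_ps(a, b);                 // lanes where a < b
+ *   __m128 min = _mm_or_ps(_mm_and_ps(lt, a),        // take a where a < b,
+ *                          _mm_andnot_ps(lt, b));    // otherwise take b
+ */
+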
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the value in the first operand is less than or
+/// equal to the corresponding value in the second operand and returns the
+/// result of the comparison in the low-order bits of a vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+/// in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmple_ss(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares each of the corresponding 32-bit float values of the
+/// 128-bit vectors of [4 x float] to determine if the values in the first
+/// operand are less than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmple_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the value in the first operand is greater than
+/// the corresponding value in the second operand and returns the result of
+/// the comparison in the low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+/// in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpgt_ss(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_shufflevector((__v4sf)__a,
+ (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
+ 4, 1, 2, 3);
+}
+
+/// Compares each of the corresponding 32-bit float values of the
+/// 128-bit vectors of [4 x float] to determine if the values in the first
+/// operand are greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpgt_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the value in the first operand is greater than
+/// or equal to the corresponding value in the second operand and returns
+/// the result of the comparison in the low-order bits of a vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+/// in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpge_ss(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_shufflevector((__v4sf)__a,
+ (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
+ 4, 1, 2, 3);
+}
+
+/// Compares each of the corresponding 32-bit float values of the
+/// 128-bit vectors of [4 x float] to determine if the values in the first
+/// operand are greater than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpge_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands for inequality and returns the result of the comparison in the
+/// low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+/// in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpneq_ss(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares each of the corresponding 32-bit float values of the
+/// 128-bit vectors of [4 x float] for inequality.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpneq_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the value in the first operand is not less than
+/// the corresponding value in the second operand and returns the result of
+/// the comparison in the low-order bits of a vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+/// in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnlt_ss(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares each of the corresponding 32-bit float values of the
+/// 128-bit vectors of [4 x float] to determine if the values in the first
+/// operand are not less than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnlt_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the value in the first operand is not less than
+/// or equal to the corresponding value in the second operand and returns
+/// the result of the comparison in the low-order bits of a vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+/// in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnle_ss(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares each of the corresponding 32-bit float values of the
+/// 128-bit vectors of [4 x float] to determine if the values in the first
+/// operand are not less than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnle_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the value in the first operand is not greater
+/// than the corresponding value in the second operand and returns the
+/// result of the comparison in the low-order bits of a vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+/// in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpngt_ss(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_shufflevector((__v4sf)__a,
+ (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
+ 4, 1, 2, 3);
+}
+
+/// Compares each of the corresponding 32-bit float values of the
+/// 128-bit vectors of [4 x float] to determine if the values in the first
+/// operand are not greater than those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpngt_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the value in the first operand is not greater
+/// than or equal to the corresponding value in the second operand and
+/// returns the result of the comparison in the low-order bits of a vector
+/// of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+/// in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnge_ss(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_shufflevector((__v4sf)__a,
+ (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
+ 4, 1, 2, 3);
+}
+
+/// Compares each of the corresponding 32-bit float values of the
+/// 128-bit vectors of [4 x float] to determine if the values in the first
+/// operand are not greater than or equal to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpnge_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the value in the first operand is ordered with
+/// respect to the corresponding value in the second operand and returns the
+/// result of the comparison in the low-order bits of a vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+/// in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpord_ss(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares each of the corresponding 32-bit float values of the
+/// 128-bit vectors of [4 x float] to determine if the values in the first
+/// operand are ordered with respect to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpord_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the value in the first operand is unordered
+/// with respect to the corresponding value in the second operand and
+/// returns the result of the comparison in the low-order bits of a vector
+/// of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float] containing one of the operands. The lower
+/// 32 bits of this operand are used in the comparison.
+/// \returns A 128-bit vector of [4 x float] containing the comparison results
+/// in the low-order bits.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpunord_ss(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares each of the corresponding 32-bit float values of the
+/// 128-bit vectors of [4 x float] to determine if the values in the first
+/// operand are unordered with respect to those in the second operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// \returns A 128-bit vector of [4 x float] containing the comparison results.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cmpunord_ps(__m128 __a, __m128 __b)
+{
+ return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
+}
+
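+/* Usage sketch (informative): comparing a vector with itself using
+ * _mm_cmpunord_ps produces an all-ones mask exactly in the lanes holding NaN
+ * (NaN is unordered even with itself), which makes it easy to scrub NaNs:
+ *
+ *   __m128 nan_mask = _mm_cmpunord_ps(v, v);      // all ones where v is NaN
+ *   __m128 cleaned  = _mm_andnot_ps(nan_mask, v); // NaN lanes become +0.0
+ */
+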
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands for equality and returns the result of the comparison.
+///
+/// If either of the two lower 32-bit values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \returns An integer containing the comparison results. If either of the
+/// two lower 32-bit values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comieq_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the first operand is less than the second
+/// operand and returns the result of the comparison.
+///
+/// If either of the two lower 32-bit values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower 32-bit values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comilt_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the first operand is less than or equal to the
+/// second operand and returns the result of the comparison.
+///
+/// If either of the two lower 32-bit values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower 32-bit values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comile_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the first operand is greater than the second
+/// operand and returns the result of the comparison.
+///
+/// If either of the two lower 32-bit values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \returns An integer containing the comparison results. If either of the
+/// two lower 32-bit values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comigt_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the first operand is greater than or equal to
+/// the second operand and returns the result of the comparison.
+///
+/// If either of the two lower 32-bit values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower 32-bit values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comige_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Compares two 32-bit float values in the low-order bits of both
+/// operands to determine if the first operand is not equal to the second
+/// operand and returns the result of the comparison.
+///
+/// If either of the two lower 32-bit values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \returns An integer containing the comparison results. If either of the
+/// two lower 32-bit values is NaN, 1 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_comineq_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
+}
+
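+/* Usage sketch (informative): unlike the _mm_cmp*_ss intrinsics above, which
+ * return a vector mask, the _mm_comi*_ss family returns a plain int and can
+ * drive ordinary control flow (handle_less() is a placeholder):
+ *
+ *   if (_mm_comilt_ss(a, b))
+ *       handle_less();   // the low lane of a compared less than that of b
+ *
+ * COMISS raises an invalid-operation exception for any NaN operand; the
+ * _mm_ucomi*_ss variants below raise it only for signaling NaNs.
+ */
+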
+/// Performs an unordered comparison of two 32-bit float values using
+/// the low-order bits of both operands to determine equality and returns
+/// the result of the comparison.
+///
+/// If either of the two lower 32-bit values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower 32-bit values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomieq_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Performs an unordered comparison of two 32-bit float values using
+/// the low-order bits of both operands to determine if the first operand is
+/// less than the second operand and returns the result of the comparison.
+///
+/// If either of the two lower 32-bit values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower 32-bit values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomilt_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Performs an unordered comparison of two 32-bit float values using
+/// the low-order bits of both operands to determine if the first operand is
+/// less than or equal to the second operand and returns the result of the
+/// comparison.
+///
+/// If either of the two lower 32-bit values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower 32-bit values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomile_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Performs an unordered comparison of two 32-bit float values using
+/// the low-order bits of both operands to determine if the first operand is
+/// greater than the second operand and returns the result of the
+/// comparison.
+///
+/// If either of the two lower 32-bit values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower 32-bit values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomigt_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Performs an unordered comparison of two 32-bit float values using
+/// the low-order bits of both operands to determine if the first operand is
+/// greater than or equal to the second operand and returns the result of
+/// the comparison.
+///
+/// If either of the two lower 32-bit values is NaN, 0 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower 32-bit values is NaN, 0 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomige_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Performs an unordered comparison of two 32-bit float values using
+/// the low-order bits of both operands to determine inequality and returns
+/// the result of the comparison.
+///
+/// If either of the two lower 32-bit values is NaN, 1 is returned.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \param __b
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the comparison.
+/// \returns An integer containing the comparison results. If either of the two
+/// lower 32-bit values is NaN, 1 is returned.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_ucomineq_ss(__m128 __a, __m128 __b)
+{
+ return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
+}
+
+/// Converts a float value contained in the lower 32 bits of a vector of
+/// [4 x float] into a 32-bit integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtss_si32(__m128 __a)
+{
+ return __builtin_ia32_cvtss2si((__v4sf)__a);
+}
+
+/// Converts a float value contained in the lower 32 bits of a vector of
+/// [4 x float] into a 32-bit integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvt_ss2si(__m128 __a)
+{
+ return _mm_cvtss_si32(__a);
+}
+
+#ifdef __x86_64__
+
+/// Converts a float value contained in the lower 32 bits of a vector of
+/// [4 x float] into a 64-bit integer.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the conversion.
+/// \returns A 64-bit integer containing the converted value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvtss_si64(__m128 __a)
+{
+ return __builtin_ia32_cvtss2si64((__v4sf)__a);
+}
+
+#endif
+
+/// Converts two low-order float values in a 128-bit vector of
+/// [4 x float] into a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_cvtps_pi32(__m128 __a)
+{
+ return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
+}
+
+/// Converts two low-order float values in a 128-bit vector of
+/// [4 x float] into a 64-bit vector of [2 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_cvt_ps2pi(__m128 __a)
+{
+ return _mm_cvtps_pi32(__a);
+}
+
+/// Converts a float value contained in the lower 32 bits of a vector of
+/// [4 x float] into a 32-bit integer, truncating the result when it is
+/// inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvttss_si32(__m128 __a)
+{
+ return __builtin_ia32_cvttss2si((__v4sf)__a);
+}
+
+/// Converts a float value contained in the lower 32 bits of a vector of
+/// [4 x float] into a 32-bit integer, truncating the result when it is
+/// inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the conversion.
+/// \returns A 32-bit integer containing the converted value.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_cvtt_ss2si(__m128 __a)
+{
+ return _mm_cvttss_si32(__a);
+}
+
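+/* Usage sketch (informative): _mm_cvtss_si32 rounds according to the current
+ * MXCSR rounding mode (round-to-nearest-even by default), whereas
+ * _mm_cvttss_si32 always truncates toward zero:
+ *
+ *   int r = _mm_cvtss_si32(_mm_set_ss(2.7f));    // 3 under the default mode
+ *   int t = _mm_cvttss_si32(_mm_set_ss(2.7f));   // 2 (truncated)
+ */
+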
+#ifdef __x86_64__
+/// Converts a float value contained in the lower 32 bits of a vector of
+/// [4 x float] into a 64-bit integer, truncating the result when it is
+/// inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
+/// instructions.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the conversion.
+/// \returns A 64-bit integer containing the converted value.
+static __inline__ long long __DEFAULT_FN_ATTRS
+_mm_cvttss_si64(__m128 __a)
+{
+ return __builtin_ia32_cvttss2si64((__v4sf)__a);
+}
+#endif
+
+/// Converts two low-order float values in a 128-bit vector of
+/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
+/// when it is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_cvttps_pi32(__m128 __a)
+{
+ return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
+}
+
+/// Converts two low-order float values in a 128-bit vector of [4 x
+/// float] into a 64-bit vector of [2 x i32], truncating the result when it
+/// is inexact.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \returns A 64-bit integer vector containing the converted values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_cvtt_ps2pi(__m128 __a)
+{
+ return _mm_cvttps_pi32(__a);
+}
+
+/// Converts a 32-bit signed integer value into a floating point value
+/// and writes it to the lower 32 bits of the destination. The remaining
+/// higher order elements of the destination vector are copied from the
+/// corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 32-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+/// converted value of the second operand. The upper 96 bits are copied from
+/// the upper 96 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsi32_ss(__m128 __a, int __b)
+{
+ __a[0] = __b;
+ return __a;
+}
+
+/// Converts a 32-bit signed integer value into a floating point value
+/// and writes it to the lower 32 bits of the destination. The remaining
+/// higher order elements of the destination are copied from the
+/// corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 32-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+/// converted value of the second operand. The upper 96 bits are copied from
+/// the upper 96 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvt_si2ss(__m128 __a, int __b)
+{
+ return _mm_cvtsi32_ss(__a, __b);
+}
+
+#ifdef __x86_64__
+
+/// Converts a 64-bit signed integer value into a floating point value
+/// and writes it to the lower 32 bits of the destination. The remaining
+/// higher order elements of the destination are copied from the
+/// corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 64-bit signed integer operand containing the value to be converted.
+/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
+/// converted value of the second operand. The upper 96 bits are copied from
+/// the upper 96 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_cvtsi64_ss(__m128 __a, long long __b)
+{
+ __a[0] = __b;
+ return __a;
+}
+
+#endif
+
+/// Converts two elements of a 64-bit vector of [2 x i32] into two
+/// floating point values and writes them to the lower 64-bits of the
+/// destination. The remaining higher order elements of the destination are
+/// copied from the corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
+/// and written to the corresponding low-order elements in the destination.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+/// converted value of the second operand. The upper 64 bits are copied from
+/// the upper 64 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+_mm_cvtpi32_ps(__m128 __a, __m64 __b)
+{
+ return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
+}
+
+/// Converts two elements of a 64-bit vector of [2 x i32] into two
+/// floating point values and writes them to the lower 64-bits of the
+/// destination. The remaining higher order elements of the destination are
+/// copied from the corresponding elements in the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float].
+/// \param __b
+/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
+/// and written to the corresponding low-order elements in the destination.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+/// converted value from the second operand. The upper 64 bits are copied
+/// from the upper 64 bits of the first operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+_mm_cvt_pi2ps(__m128 __a, __m64 __b)
+{
+ return _mm_cvtpi32_ps(__a, __b);
+}
+
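+/* Usage sketch (informative): the __m64 conversions in this group execute in
+ * the MMX register file, so code mixing them with x87 floating point should
+ * call _mm_empty() (EMMS, declared in mmintrin.h) once the MMX work is done:
+ *
+ *   __m64 lo2 = _mm_cvtps_pi32(v);   // convert the two low lanes of v
+ *   _mm_empty();                     // clear MMX state before later x87 use
+ */
+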
+/// Extracts a float value contained in the lower 32 bits of a vector of
+/// [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
+/// used in the extraction.
+/// \returns A 32-bit float containing the extracted value.
+static __inline__ float __DEFAULT_FN_ATTRS
+_mm_cvtss_f32(__m128 __a)
+{
+ return __a[0];
+}
+
+/// Loads two packed float values from the address \a __p into the
+/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
+/// are copied from the low-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
+/// of the destination.
+/// \param __p
+/// A pointer to two packed float values. Bits [63:0] are written to bits
+/// [127:64] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadh_pi(__m128 __a, const __m64 *__p)
+{
+ typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
+ struct __mm_loadh_pi_struct {
+ __mm_loadh_pi_v2f32 __u;
+ } __attribute__((__packed__, __may_alias__));
+ __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
+ __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+ return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
+}
+
+/// Loads two packed float values from the address \a __p into the
+/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
+/// are copied from the high-order bits of the first operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
+/// [127:64] of the destination.
+/// \param __p
+/// A pointer to two packed float values. Bits [63:0] are written to bits
+/// [63:0] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the moved values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadl_pi(__m128 __a, const __m64 *__p)
+{
+ typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
+ struct __mm_loadl_pi_struct {
+ __mm_loadl_pi_v2f32 __u;
+ } __attribute__((__packed__, __may_alias__));
+ __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
+ __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
+ return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
+}
+
+/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
+/// 32 bits of the vector are initialized with the single-precision
+/// floating-point value loaded from a specified memory location. The upper
+/// 96 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 32-bit memory location containing a single-precision
+/// floating-point value.
+/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
+/// lower 32 bits contain the value loaded from the memory location. The
+/// upper 96 bits are set to zero.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load_ss(const float *__p)
+{
+ struct __mm_load_ss_struct {
+ float __u;
+ } __attribute__((__packed__, __may_alias__));
+ float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
+ return __extension__ (__m128){ __u, 0, 0, 0 };
+}
+
+/// Loads a 32-bit float value and duplicates it to all four vector
+/// elements of a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
+/// instruction.
+///
+/// \param __p
+/// A pointer to a float value to be loaded and duplicated.
+/// \returns A 128-bit vector of [4 x float] containing the loaded and
+/// duplicated values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load1_ps(const float *__p)
+{
+ struct __mm_load1_ps_struct {
+ float __u;
+ } __attribute__((__packed__, __may_alias__));
+ float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
+ return __extension__ (__m128){ __u, __u, __u, __u };
+}
+
+#define _mm_load_ps1(p) _mm_load1_ps(p)
+
+/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location has to be 128-bit aligned.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_load_ps(const float *__p)
+{
+ return *(const __m128*)__p;
+}
+
+/// Loads a 128-bit floating-point vector of [4 x float] from an
+/// unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \returns A 128-bit vector of [4 x float] containing the loaded values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadu_ps(const float *__p)
+{
+ struct __loadu_ps {
+ __m128_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ return ((const struct __loadu_ps*)__p)->__v;
+}
+
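+/* Usage sketch (informative): _mm_load_ps requires a 16-byte-aligned address,
+ * while _mm_loadu_ps accepts any address (typically at some cost on older
+ * processors):
+ *
+ *   float buf[4] __attribute__((aligned(16))) = { 1.0f, 2.0f, 3.0f, 4.0f };
+ *   __m128 a = _mm_load_ps(buf);    // alignment guaranteed by the attribute
+ *   __m128 u = _mm_loadu_ps(buf);   // would also accept an unaligned pointer
+ */
+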
+/// Loads four packed float values, in reverse order, from an aligned
+/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
+/// instruction.
+///
+/// \param __p
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location has to be 128-bit aligned.
+/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
+/// in reverse order.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_loadr_ps(const float *__p)
+{
+ __m128 __a = _mm_load_ps(__p);
+ return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
+}
+
+/// Create a 128-bit vector of [4 x float] with undefined values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \returns A 128-bit vector of [4 x float] containing undefined values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_undefined_ps(void)
+{
+ return (__m128)__builtin_ia32_undef128();
+}
+
+/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
+/// 32 bits of the vector are initialized with the specified single-precision
+/// floating-point value. The upper 96 bits are set to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __w
+/// A single-precision floating-point value used to initialize the lower 32
+/// bits of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
+/// lower 32 bits contain the value provided in the source operand. The
+/// upper 96 bits are set to zero.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ss(float __w)
+{
+ return __extension__ (__m128){ __w, 0, 0, 0 };
+}
+
+/// Constructs a 128-bit floating-point vector of [4 x float], with each
+/// of the four single-precision floating-point vector elements set to the
+/// specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
+///
+/// \param __w
+/// A single-precision floating-point value used to initialize each vector
+/// element of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set1_ps(float __w)
+{
+ return __extension__ (__m128){ __w, __w, __w, __w };
+}
+
+/* Microsoft specific. */
+/// Constructs a 128-bit floating-point vector of [4 x float], with each
+/// of the four single-precision floating-point vector elements set to the
+/// specified single-precision floating-point value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
+///
+/// \param __w
+/// A single-precision floating-point value used to initialize each vector
+/// element of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ps1(float __w)
+{
+ return _mm_set1_ps(__w);
+}
+
+/// Constructs a 128-bit floating-point vector of [4 x float]
+/// initialized with the specified single-precision floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __z
+/// A single-precision floating-point value used to initialize bits [127:96]
+/// of the result.
+/// \param __y
+/// A single-precision floating-point value used to initialize bits [95:64]
+/// of the result.
+/// \param __x
+/// A single-precision floating-point value used to initialize bits [63:32]
+/// of the result.
+/// \param __w
+/// A single-precision floating-point value used to initialize bits [31:0]
+/// of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_set_ps(float __z, float __y, float __x, float __w)
+{
+ return __extension__ (__m128){ __w, __x, __y, __z };
+}
+
+/// Constructs a 128-bit floating-point vector of [4 x float],
+/// initialized in reverse order with the specified 32-bit single-precision
+/// floating-point values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+/// instruction.
+///
+/// \param __z
+/// A single-precision floating-point value used to initialize bits [31:0]
+/// of the result.
+/// \param __y
+/// A single-precision floating-point value used to initialize bits [63:32]
+/// of the result.
+/// \param __x
+/// A single-precision floating-point value used to initialize bits [95:64]
+/// of the result.
+/// \param __w
+/// A single-precision floating-point value used to initialize bits [127:96]
+/// of the result.
+/// \returns An initialized 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_setr_ps(float __z, float __y, float __x, float __w)
+{
+ return __extension__ (__m128){ __z, __y, __x, __w };
+}
+
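+/* Usage sketch (informative): _mm_set_ps lists elements from the highest lane
+ * down, while _mm_setr_ps lists them from the lowest lane up, so both calls
+ * below build the same vector (lane 0 = 1.0f ... lane 3 = 4.0f):
+ *
+ *   __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
+ *   __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
+ */
+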
+/// Constructs a 128-bit floating-point vector of [4 x float] initialized
+/// to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
+///
+/// \returns An initialized 128-bit floating-point vector of [4 x float] with
+/// all elements set to zero.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_setzero_ps(void)
+{
+ return __extension__ (__m128){ 0, 0, 0, 0 };
+}
+
+/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 64-bit memory location.
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeh_pi(__m64 *__p, __m128 __a)
+{
+ typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
+ struct __mm_storeh_pi_struct {
+ __mm_storeh_pi_v2f32 __u;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
+}
+
+/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a memory location that will receive the float values.
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storel_pi(__m64 *__p, __m128 __a)
+{
+ typedef float __mm_storel_pi_v2f32 __attribute__((__vector_size__(8)));
+ struct __mm_storel_pi_struct {
+ __mm_storel_pi_v2f32 __u;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __mm_storel_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
+}
+
+/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
+/// memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 32-bit memory location.
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ss(float *__p, __m128 __a)
+{
+ struct __mm_store_ss_struct {
+ float __u;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
+}
+
+/// Stores a 128-bit vector of [4 x float] to an unaligned memory
+/// location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_ps(float *__p, __m128 __a)
+{
+ struct __storeu_ps {
+ __m128_u __v;
+ } __attribute__((__packed__, __may_alias__));
+ ((struct __storeu_ps*)__p)->__v = __a;
+}
+
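+/* Usage sketch (informative): paired with _mm_loadu_ps above, this allows
+ * arbitrarily aligned float arrays to be processed four lanes at a time
+ * (src and dst are placeholder float pointers):
+ *
+ *   __m128 v = _mm_loadu_ps(src);           // no alignment assumed
+ *   v = _mm_mul_ps(v, _mm_set1_ps(2.0f));   // scale each lane by 2
+ *   _mm_storeu_ps(dst, v);
+ */
+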
+/// Stores a 128-bit vector of [4 x float] into an aligned memory
+/// location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location has to be 16-byte aligned.
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ps(float *__p, __m128 __a)
+{
+ *(__m128*)__p = __a;
+}
+
+/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
+/// four contiguous elements in an aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
+/// instruction.
+///
+/// \param __p
+/// A pointer to a 128-bit memory location.
+/// \param __a
+/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
+/// of the four contiguous elements pointed to by \a __p.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store1_ps(float *__p, __m128 __a)
+{
+ __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
+ _mm_store_ps(__p, __a);
+}
+
+/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
+/// four contiguous elements in an aligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
+/// instruction.
+///
+/// \param __p
+/// A pointer to a 128-bit memory location.
+/// \param __a
+/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
+/// of the four contiguous elements pointed to by \a __p.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_store_ps1(float *__p, __m128 __a)
+{
+ _mm_store1_ps(__p, __a);
+}
+
+/// Stores float values from a 128-bit vector of [4 x float] to an
+/// aligned memory location in reverse order.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
+/// instruction.
+///
+/// \param __p
+/// A pointer to a 128-bit memory location. The address of the memory
+/// location has to be 128-bit aligned.
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the values to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storer_ps(float *__p, __m128 __a)
+{
+ __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
+ _mm_store_ps(__p, __a);
+}
+
+#define _MM_HINT_ET0 7
+#define _MM_HINT_ET1 6
+#define _MM_HINT_T0 3
+#define _MM_HINT_T1 2
+#define _MM_HINT_T2 1
+#define _MM_HINT_NTA 0
+
+#ifndef _MSC_VER
+/* FIXME: We have to #define this because "sel" must be a constant integer, and
+ Sema doesn't do any form of constant propagation yet. */
+
+/// Loads one cache line of data from the specified address to a location
+/// closer to the processor.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// void _mm_prefetch(const void * a, const int sel);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
+///
+/// \param a
+/// A pointer to a memory location containing a cache line of data.
+/// \param sel
+/// A predefined integer constant specifying the type of prefetch
+/// operation: \n
+/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
+/// PREFETCHNTA instruction will be generated. \n
+/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
+/// be generated. \n
+/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
+/// be generated. \n
+/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
+/// be generated.
+#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
+ ((sel) >> 2) & 1, (sel) & 0x3))
+#endif
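+
+/* Editorial note (illustrative, not part of the upstream header): the macro
+ * above derives __builtin_prefetch's (rw, locality) arguments from the hint.
+ * For example, _MM_HINT_T0 (3) gives rw = (3 >> 2) & 1 = 0 (read) and
+ * locality = 3, while _MM_HINT_ET0 (7) gives rw = 1 (write) and locality = 3.
+ * A minimal usage sketch, assuming an SSE-enabled build:
+ *
+ *   #include <xmmintrin.h>
+ *
+ *   static void warm(const float *p)
+ *   {
+ *     _mm_prefetch((const char *)p, _MM_HINT_T0);  // hint: keep in all cache levels
+ *   }
+ */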
+
+/// Stores a 64-bit integer in the specified aligned memory location. To
+/// minimize caching, the data is flagged as non-temporal (unlikely to be
+/// used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
+///
+/// \param __p
+/// A pointer to an aligned memory location used to store the register value.
+/// \param __a
+/// A 64-bit integer containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS_MMX
+_mm_stream_pi(__m64 *__p, __m64 __a)
+{
+ __builtin_ia32_movntq(__p, __a);
+}
+
+/// Moves packed float values from a 128-bit vector of [4 x float] to a
+/// 128-bit aligned memory location. To minimize caching, the data is flagged
+/// as non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
+///
+/// \param __p
+/// A pointer to a 128-bit aligned memory location that will receive the
+/// single-precision floating-point values.
+/// \param __a
+/// A 128-bit vector of [4 x float] containing the values to be moved.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_stream_ps(float *__p, __m128 __a)
+{
+ __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
+}
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// Forces strong memory ordering (serialization) between store
+/// instructions preceding this instruction and store instructions following
+/// this instruction, ensuring the system completes all previous stores
+/// before executing subsequent stores.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
+///
+void _mm_sfence(void);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
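+/* Editorial note (illustrative, not part of the upstream header): non-temporal
+ * stores such as _mm_stream_ps() are weakly ordered, so an _mm_sfence() is
+ * typically issued after a batch of them before the data is handed to another
+ * agent. A minimal sketch, assuming a 16-byte aligned destination:
+ *
+ *   #include <xmmintrin.h>
+ *
+ *   static void fill(float *dst, __m128 v, int n)   // n in units of 4 floats
+ *   {
+ *     for (int i = 0; i < n; ++i)
+ *       _mm_stream_ps(dst + 4 * i, v);              // bypasses the caches
+ *     _mm_sfence();                                 // order the streaming stores
+ *   }
+ */
+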
+/// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
+/// returns it, as specified by the immediate integer operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// int _mm_extract_pi16(__m64 a, int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
+///
+/// \param a
+/// A 64-bit vector of [4 x i16].
+/// \param n
+/// An immediate integer operand that determines which bits are extracted: \n
+/// 0: Bits [15:0] are copied to the destination. \n
+/// 1: Bits [31:16] are copied to the destination. \n
+/// 2: Bits [47:32] are copied to the destination. \n
+/// 3: Bits [63:48] are copied to the destination.
+/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
+#define _mm_extract_pi16(a, n) \
+ ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
+
+/// Copies data from the 64-bit vector of [4 x i16] to the destination,
+/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
+/// specified by the immediate operand \a n.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
+///
+/// \param a
+/// A 64-bit vector of [4 x i16].
+/// \param d
+/// An integer. The lower 16-bit value from this operand is written to the
+/// destination at the offset specified by operand \a n.
+/// \param n
+/// An immediate integer operand that determines which bits are to be used
+/// in the destination. \n
+/// 0: Bits [15:0] are copied to the destination. \n
+/// 1: Bits [31:16] are copied to the destination. \n
+/// 2: Bits [47:32] are copied to the destination. \n
+/// 3: Bits [63:48] are copied to the destination. \n
+/// The remaining bits in the destination are copied from the corresponding
+/// bits in operand \a a.
+/// \returns A 64-bit integer vector containing the copied packed data from the
+/// operands.
+#define _mm_insert_pi16(a, d, n) \
+ ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
+
+/// Compares each of the corresponding packed 16-bit integer values of
+/// the 64-bit integer vectors, and writes the greater value to the
+/// corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing one of the source operands.
+/// \param __b
+/// A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_max_pi16(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// Compares each of the corresponding packed 8-bit unsigned integer
+/// values of the 64-bit integer vectors, and writes the greater value to the
+/// corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing one of the source operands.
+/// \param __b
+/// A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_max_pu8(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
+}
+
+/// Compares each of the corresponding packed 16-bit integer values of
+/// the 64-bit integer vectors, and writes the lesser value to the
+/// corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing one of the source operands.
+/// \param __b
+/// A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_min_pi16(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// Compares each of the corresponding packed 8-bit unsigned integer
+/// values of the 64-bit integer vectors, and writes the lesser value to the
+/// corresponding bits in the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing one of the source operands.
+/// \param __b
+/// A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the comparison results.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_min_pu8(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
+}
+
+/// Takes the most significant bit from each 8-bit element in a 64-bit
+/// integer vector to create an 8-bit mask value. Zero-extends the value to a
+/// 32-bit integer and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing the values with bits to be extracted.
+/// \returns The most significant bit from each 8-bit element in \a __a,
+/// written to bits [7:0].
+static __inline__ int __DEFAULT_FN_ATTRS_MMX
+_mm_movemask_pi8(__m64 __a)
+{
+ return __builtin_ia32_pmovmskb((__v8qi)__a);
+}
+
+/// Multiplies packed 16-bit unsigned integer values and writes the
+/// high-order 16 bits of each 32-bit product to the corresponding bits in
+/// the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing one of the source operands.
+/// \param __b
+/// A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the products of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_mulhi_pu16(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
+/// destination, as specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
+///
+/// \param a
+/// A 64-bit integer vector containing the values to be shuffled.
+/// \param n
+/// An immediate value containing an 8-bit value specifying which elements to
+/// copy from \a a. The destinations within the 64-bit destination are
+/// assigned values as follows: \n
+/// Bits [1:0] are used to assign values to bits [15:0] in the
+/// destination. \n
+/// Bits [3:2] are used to assign values to bits [31:16] in the
+/// destination. \n
+/// Bits [5:4] are used to assign values to bits [47:32] in the
+/// destination. \n
+/// Bits [7:6] are used to assign values to bits [63:48] in the
+/// destination. \n
+/// Bit value assignments: \n
+/// 00: assigned from bits [15:0] of \a a. \n
+/// 01: assigned from bits [31:16] of \a a. \n
+/// 10: assigned from bits [47:32] of \a a. \n
+/// 11: assigned from bits [63:48] of \a a.
+/// \returns A 64-bit integer vector containing the shuffled values.
+#define _mm_shuffle_pi16(a, n) \
+ ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
+
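+/* Editorial note (illustrative, not part of the upstream header): the 8-bit
+ * immediate selects one source element per destination element, two bits per
+ * lane. For example, 0x1B (binary 00 01 10 11) reverses the four 16-bit
+ * elements. A minimal sketch:
+ *
+ *   #include <xmmintrin.h>
+ *
+ *   __m64 reverse4x16(__m64 v)
+ *   {
+ *     return _mm_shuffle_pi16(v, 0x1B);   // element i of the result is v[3 - i]
+ *   }
+ */
+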
+/// Conditionally copies the values from each 8-bit element in the first
+/// 64-bit integer vector operand to the specified memory location, as
+/// specified by the most significant bit in the corresponding element in the
+/// second 64-bit integer vector operand.
+///
+/// To minimize caching, the data is flagged as non-temporal
+/// (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
+///
+/// \param __d
+/// A 64-bit integer vector containing the values with elements to be copied.
+/// \param __n
+/// A 64-bit integer vector operand. The most significant bit from each 8-bit
+/// element determines whether the corresponding element in operand \a __d
+/// is copied. If the most significant bit of a given element is 1, the
+/// corresponding element in operand \a __d is copied.
+/// \param __p
+/// A pointer to a 64-bit memory location that will receive the conditionally
+/// copied integer values. The address of the memory location does not have
+/// to be aligned.
+static __inline__ void __DEFAULT_FN_ATTRS_MMX
+_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
+{
+ __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
+}
+
+/// Computes the rounded averages of the packed unsigned 8-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing one of the source operands.
+/// \param __b
+/// A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the averages of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_avg_pu8(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
+}
+
+/// Computes the rounded averages of the packed unsigned 16-bit integer
+/// values and writes the averages to the corresponding bits in the
+/// destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing one of the source operands.
+/// \param __b
+/// A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector containing the averages of both operands.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_avg_pu16(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
+}
+
+/// Subtracts the corresponding 8-bit unsigned integer values of the two
+/// 64-bit vector operands and computes the absolute value of each
+/// difference. The sum of the 8 absolute differences is then written to
+/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
+///
+/// \param __a
+/// A 64-bit integer vector containing one of the source operands.
+/// \param __b
+/// A 64-bit integer vector containing one of the source operands.
+/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
+/// sets of absolute differences between both operands. The upper bits are
+/// cleared.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_sad_pu8(__m64 __a, __m64 __b)
+{
+ return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
+}
+
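+/* Editorial note (illustrative, not part of the upstream header): for example,
+ * with __a = {1, 2, 3, 4, 5, 6, 7, 8} and __b = {8, 7, 6, 5, 4, 3, 2, 1} the
+ * absolute differences are {7, 5, 3, 1, 1, 3, 5, 7}, so bits [15:0] of the
+ * result hold their sum, 32, and bits [63:16] are zero.
+ */
+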
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/// Returns the contents of the MXCSR register as a 32-bit unsigned
+/// integer value.
+///
+/// There are several groups of macros associated with this
+/// intrinsic, including:
+/// <ul>
+/// <li>
+/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
+/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
+/// _MM_GET_EXCEPTION_STATE().
+/// </li>
+/// <li>
+/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
+/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
+/// </li>
+/// <li>
+/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
+/// _MM_GET_ROUNDING_MODE().
+/// </li>
+/// <li>
+/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
+/// </li>
+/// <li>
+/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
+/// _MM_GET_DENORMALS_ZERO_MODE().
+/// </li>
+/// </ul>
+///
+/// For example, the following expression checks if an overflow exception has
+/// occurred:
+/// \code
+/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
+/// \endcode
+///
+/// The following expression gets the current rounding mode:
+/// \code
+/// _MM_GET_ROUNDING_MODE()
+/// \endcode
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
+///
+/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
+/// register.
+unsigned int _mm_getcsr(void);
+
+/// Sets the MXCSR register with the 32-bit unsigned integer value.
+///
+/// There are several groups of macros associated with this intrinsic,
+/// including:
+/// <ul>
+/// <li>
+/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
+/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
+/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
+/// </li>
+/// <li>
+/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
+/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
+/// of these macros.
+/// </li>
+/// <li>
+/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
+/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
+/// </li>
+/// <li>
+/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
+/// one of these macros.
+/// </li>
+/// <li>
+/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
+/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
+/// </li>
+/// </ul>
+///
+/// For example, the following expression causes subsequent floating-point
+/// operations to round up:
+/// \code
+/// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP)
+/// \endcode
+///
+/// The following example sets the DAZ and FTZ flags:
+/// \code
+/// void setFlags() {
+/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+/// }
+/// \endcode
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
+///
+/// \param __i
+/// A 32-bit unsigned integer value to be written to the MXCSR register.
+void _mm_setcsr(unsigned int __i);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+/// Selects 4 float values from the 128-bit operands of [4 x float], as
+/// specified by the immediate value operand.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
+/// \endcode
+///
+/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
+///
+/// \param a
+/// A 128-bit vector of [4 x float].
+/// \param b
+/// A 128-bit vector of [4 x float].
+/// \param mask
+/// An immediate value containing an 8-bit value specifying which elements to
+/// copy from \a a and \a b. \n
+/// Bits [3:0] specify the values copied from operand \a a. \n
+/// Bits [7:4] specify the values copied from operand \a b. \n
+/// The destinations within the 128-bit destination are assigned values as
+/// follows: \n
+/// Bits [1:0] are used to assign values to bits [31:0] in the
+/// destination. \n
+/// Bits [3:2] are used to assign values to bits [63:32] in the
+/// destination. \n
+/// Bits [5:4] are used to assign values to bits [95:64] in the
+/// destination. \n
+/// Bits [7:6] are used to assign values to bits [127:96] in the
+/// destination. \n
+/// Bit value assignments: \n
+/// 00: Bits [31:0] copied from the specified operand. \n
+/// 01: Bits [63:32] copied from the specified operand. \n
+/// 10: Bits [95:64] copied from the specified operand. \n
+/// 11: Bits [127:96] copied from the specified operand.
+/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
+#define _mm_shuffle_ps(a, b, mask) \
+ ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
+ (int)(mask)))
+
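+/* Editorial note (illustrative, not part of the upstream header): the
+ * _MM_SHUFFLE(z, y, x, w) helper defined later in this file builds the 8-bit
+ * immediate; w and x pick elements of \a a for the low two destination
+ * elements, y and z pick elements of \a b for the high two. A minimal sketch:
+ *
+ *   #include <xmmintrin.h>
+ *
+ *   __m128 dup_low_high(__m128 a, __m128 b)
+ *   {
+ *     // result = { a[0], a[0], b[3], b[3] }
+ *     return _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 3, 0, 0));
+ *   }
+ */
+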
+/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [95:64] are written to bits [31:0] of the destination. \n
+/// Bits [127:96] are written to bits [95:64] of the destination.
+/// \param __b
+/// A 128-bit vector of [4 x float].
+/// Bits [95:64] are written to bits [63:32] of the destination. \n
+/// Bits [127:96] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_unpackhi_ps(__m128 __a, __m128 __b)
+{
+ return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
+}
+
+/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [31:0] are written to bits [31:0] of the destination. \n
+/// Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+/// A 128-bit vector of [4 x float]. \n
+/// Bits [31:0] are written to bits [63:32] of the destination. \n
+/// Bits [63:32] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_unpacklo_ps(__m128 __a, __m128 __b)
+{
+ return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
+}
+
+/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
+/// 32 bits are set to the lower 32 bits of the second parameter. The upper
+/// 96 bits are set to the upper 96 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
+/// instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
+/// written to the upper 96 bits of the result.
+/// \param __b
+/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
+/// written to the lower 32 bits of the result.
+/// \returns A 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_move_ss(__m128 __a, __m128 __b)
+{
+ __a[0] = __b[0];
+ return __a;
+}
+
+/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
+/// 64 bits are set to the upper 64 bits of the second parameter. The upper
+/// 64 bits are set to the upper 64 bits of the first parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
+/// written to the upper 64 bits of the result.
+/// \param __b
+/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
+/// written to the lower 64 bits of the result.
+/// \returns A 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movehl_ps(__m128 __a, __m128 __b)
+{
+ return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
+}
+
+/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
+/// 64 bits are set to the lower 64 bits of the first parameter. The upper
+/// 64 bits are set to the lower 64 bits of the second parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
+/// written to the lower 64 bits of the result.
+/// \param __b
+/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
+/// written to the upper 64 bits of the result.
+/// \returns A 128-bit floating-point vector of [4 x float].
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_movelh_ps(__m128 __a, __m128 __b)
+{
+ return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
+}
+
+/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
+/// float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
+///
+/// \param __a
+/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
+/// from the corresponding elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+/// values from the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+_mm_cvtpi16_ps(__m64 __a)
+{
+ __m64 __b, __c;
+ __m128 __r;
+
+ __b = _mm_setzero_si64();
+ __b = _mm_cmpgt_pi16(__b, __a);
+ __c = _mm_unpackhi_pi16(__a, __b);
+ __r = _mm_setzero_ps();
+ __r = _mm_cvtpi32_ps(__r, __c);
+ __r = _mm_movelh_ps(__r, __r);
+ __c = _mm_unpacklo_pi16(__a, __b);
+ __r = _mm_cvtpi32_ps(__r, __c);
+
+ return __r;
+}
+
+/// Converts a 64-bit vector of 16-bit unsigned integer values into a
+/// 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
+///
+/// \param __a
+/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
+/// destination are copied from the corresponding elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+/// values from the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+_mm_cvtpu16_ps(__m64 __a)
+{
+ __m64 __b, __c;
+ __m128 __r;
+
+ __b = _mm_setzero_si64();
+ __c = _mm_unpackhi_pi16(__a, __b);
+ __r = _mm_setzero_ps();
+ __r = _mm_cvtpi32_ps(__r, __c);
+ __r = _mm_movelh_ps(__r, __r);
+ __c = _mm_unpacklo_pi16(__a, __b);
+ __r = _mm_cvtpi32_ps(__r, __c);
+
+ return __r;
+}
+
+/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
+/// into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
+///
+/// \param __a
+/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
+/// from the corresponding lower 4 elements in this operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+/// values from the operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+_mm_cvtpi8_ps(__m64 __a)
+{
+ __m64 __b;
+
+ __b = _mm_setzero_si64();
+ __b = _mm_cmpgt_pi8(__b, __a);
+ __b = _mm_unpacklo_pi8(__a, __b);
+
+ return _mm_cvtpi16_ps(__b);
+}
+
+/// Converts the lower four unsigned 8-bit integer values from a 64-bit
+/// vector of [8 x u8] into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
+///
+/// \param __a
+/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
+/// destination are copied from the corresponding lower 4 elements in this
+/// operand.
+/// \returns A 128-bit vector of [4 x float] containing the copied and converted
+/// values from the source operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+_mm_cvtpu8_ps(__m64 __a)
+{
+ __m64 __b;
+
+ __b = _mm_setzero_si64();
+ __b = _mm_unpacklo_pi8(__a, __b);
+
+ return _mm_cvtpi16_ps(__b);
+}
+
+/// Converts the two 32-bit signed integer values from each 64-bit vector
+/// operand of [2 x i32] into a 128-bit vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
+///
+/// \param __a
+/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
+/// copied from the elements in this operand.
+/// \param __b
+/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
+/// copied from the elements in this operand.
+/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
+/// copied and converted values from the first operand. The upper 64 bits
+/// contain the copied and converted values from the second operand.
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
+_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
+{
+ __m128 __c;
+
+ __c = _mm_setzero_ps();
+ __c = _mm_cvtpi32_ps(__c, __b);
+ __c = _mm_movelh_ps(__c, __c);
+
+ return _mm_cvtpi32_ps(__c, __a);
+}
+
+/// Converts each single-precision floating-point element of a 128-bit
+/// floating-point vector of [4 x float] into a 16-bit signed integer, and
+/// packs the results into a 64-bit integer vector of [4 x i16].
+///
+/// If the floating-point element is NaN or infinity, or if the
+/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
+/// it is converted to 0x8000. Otherwise if the floating-point element is
+/// greater than 0x7FFF, it is converted to 0x7FFF.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [4 x float].
+/// \returns A 64-bit integer vector of [4 x i16] containing the converted
+/// values.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_cvtps_pi16(__m128 __a)
+{
+ __m64 __b, __c;
+
+ __b = _mm_cvtps_pi32(__a);
+ __a = _mm_movehl_ps(__a, __a);
+ __c = _mm_cvtps_pi32(__a);
+
+ return _mm_packs_pi32(__b, __c);
+}
+
+/// Converts each single-precision floating-point element of a 128-bit
+/// floating-point vector of [4 x float] into an 8-bit signed integer, and
+/// packs the results into the lower 32 bits of a 64-bit integer vector of
+/// [8 x i8]. The upper 32 bits of the vector are set to 0.
+///
+/// If the floating-point element is NaN or infinity, or if the
+/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
+/// is converted to 0x80. Otherwise if the floating-point element is greater
+/// than 0x7F, it is converted to 0x7F.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [4 x float].
+/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
+/// converted values and the upper 32 bits are set to zero.
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
+_mm_cvtps_pi8(__m128 __a)
+{
+ __m64 __b, __c;
+
+ __b = _mm_cvtps_pi16(__a);
+ __c = _mm_setzero_si64();
+
+ return _mm_packs_pi16(__b, __c);
+}
+
+/// Extracts the sign bits from each single-precision floating-point
+/// element of a 128-bit floating-point vector of [4 x float] and returns the
+/// sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
+/// to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
+///
+/// \param __a
+/// A 128-bit floating-point vector of [4 x float].
+/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
+/// single-precision floating-point element of the parameter. Bits [31:4] are
+/// set to zero.
+static __inline__ int __DEFAULT_FN_ATTRS
+_mm_movemask_ps(__m128 __a)
+{
+ return __builtin_ia32_movmskps((__v4sf)__a);
+}
+
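+/* Editorial note (illustrative, not part of the upstream header): the 4-bit
+ * mask is handy for branching on per-element sign or comparison results. A
+ * minimal sketch that tests whether any element has its sign bit set:
+ *
+ *   #include <xmmintrin.h>
+ *
+ *   int any_negative(__m128 v)
+ *   {
+ *     return _mm_movemask_ps(v) != 0;   // a set bit means a set sign bit
+ *   }
+ */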
+
+#define _MM_ALIGN16 __attribute__((aligned(16)))
+
+#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
+
+#define _MM_EXCEPT_INVALID (0x0001U)
+#define _MM_EXCEPT_DENORM (0x0002U)
+#define _MM_EXCEPT_DIV_ZERO (0x0004U)
+#define _MM_EXCEPT_OVERFLOW (0x0008U)
+#define _MM_EXCEPT_UNDERFLOW (0x0010U)
+#define _MM_EXCEPT_INEXACT (0x0020U)
+#define _MM_EXCEPT_MASK (0x003fU)
+
+#define _MM_MASK_INVALID (0x0080U)
+#define _MM_MASK_DENORM (0x0100U)
+#define _MM_MASK_DIV_ZERO (0x0200U)
+#define _MM_MASK_OVERFLOW (0x0400U)
+#define _MM_MASK_UNDERFLOW (0x0800U)
+#define _MM_MASK_INEXACT (0x1000U)
+#define _MM_MASK_MASK (0x1f80U)
+
+#define _MM_ROUND_NEAREST (0x0000U)
+#define _MM_ROUND_DOWN (0x2000U)
+#define _MM_ROUND_UP (0x4000U)
+#define _MM_ROUND_TOWARD_ZERO (0x6000U)
+#define _MM_ROUND_MASK (0x6000U)
+
+#define _MM_FLUSH_ZERO_MASK (0x8000U)
+#define _MM_FLUSH_ZERO_ON (0x8000U)
+#define _MM_FLUSH_ZERO_OFF (0x0000U)
+
+#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
+#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
+#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
+#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
+
+#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
+#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
+#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
+#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
+
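+/* Editorial note (illustrative, not part of the upstream header): the SET
+ * macros above read-modify-write MXCSR, so a caller that changes a field
+ * temporarily usually saves and restores the whole register. A minimal sketch:
+ *
+ *   #include <xmmintrin.h>
+ *
+ *   float scaled_truncating(float x)
+ *   {
+ *     unsigned int saved = _mm_getcsr();           // remember current MXCSR
+ *     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
+ *     float r = x * (1.0f / 3.0f);                 // rounds toward zero here
+ *     _mm_setcsr(saved);                           // restore previous settings
+ *     return r;
+ *   }
+ */
+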
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+ __m128 tmp3, tmp2, tmp1, tmp0; \
+ tmp0 = _mm_unpacklo_ps((row0), (row1)); \
+ tmp2 = _mm_unpacklo_ps((row2), (row3)); \
+ tmp1 = _mm_unpackhi_ps((row0), (row1)); \
+ tmp3 = _mm_unpackhi_ps((row2), (row3)); \
+ (row0) = _mm_movelh_ps(tmp0, tmp2); \
+ (row1) = _mm_movehl_ps(tmp2, tmp0); \
+ (row2) = _mm_movelh_ps(tmp1, tmp3); \
+ (row3) = _mm_movehl_ps(tmp3, tmp1); \
+} while (0)
+
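+/* Editorial note (illustrative, not part of the upstream header): the macro
+ * above transposes a 4x4 matrix held in four row vectors, in place. A minimal
+ * sketch, assuming 16-byte aligned row-major storage:
+ *
+ *   #include <xmmintrin.h>
+ *
+ *   void transpose4x4(float m[16])
+ *   {
+ *     __m128 r0 = _mm_load_ps(m +  0);
+ *     __m128 r1 = _mm_load_ps(m +  4);
+ *     __m128 r2 = _mm_load_ps(m +  8);
+ *     __m128 r3 = _mm_load_ps(m + 12);
+ *     _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+ *     _mm_store_ps(m +  0, r0);
+ *     _mm_store_ps(m +  4, r1);
+ *     _mm_store_ps(m +  8, r2);
+ *     _mm_store_ps(m + 12, r3);
+ *   }
+ */
+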
+/* Aliases for compatibility. */
+#define _m_pextrw _mm_extract_pi16
+#define _m_pinsrw _mm_insert_pi16
+#define _m_pmaxsw _mm_max_pi16
+#define _m_pmaxub _mm_max_pu8
+#define _m_pminsw _mm_min_pi16
+#define _m_pminub _mm_min_pu8
+#define _m_pmovmskb _mm_movemask_pi8
+#define _m_pmulhuw _mm_mulhi_pu16
+#define _m_pshufw _mm_shuffle_pi16
+#define _m_maskmovq _mm_maskmove_si64
+#define _m_pavgb _mm_avg_pu8
+#define _m_pavgw _mm_avg_pu16
+#define _m_psadbw _mm_sad_pu8
+#define _m_ _mm_
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS_MMX
+
+/* Ugly hack for backwards-compatibility (compatible with gcc) */
+#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
+#include <emmintrin.h>
+#endif
+
+#endif /* __XMMINTRIN_H */
--- /dev/null
+/*===---- xopintrin.h - XOP intrinsics -------------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86INTRIN_H
+#error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef __XOPINTRIN_H
+#define __XOPINTRIN_H
+
+#include <fma4intrin.h>
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(256)))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddw_epi8(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epi8(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi8(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epi16(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi16(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epi32(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphadddq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddw_epu8(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epu8(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu8(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddd_epu16(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu16(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_haddq_epu32(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubw_epi8(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubd_epi16(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_hsubq_epi32(__m128i __A)
+{
+ return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C));
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C));
+}
+
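+/* Editorial note (illustrative, not part of the upstream header): the two
+ * routines above implement XOP's VPCMOV as a plain bitwise select,
+ * (A & C) | (B & ~C): each result bit comes from A where the corresponding
+ * bit of C is 1 and from B where it is 0. A minimal sketch:
+ *
+ *   #include <x86intrin.h>
+ *
+ *   __m128i select_bits(__m128i a, __m128i b, __m128i mask)
+ *   {
+ *     return _mm_cmov_si128(a, b, mask);   // bitwise: a where mask = 1, else b
+ *   }
+ */
+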
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_rot_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_roti_epi8(A, N) \
+ ((__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)))
+
+#define _mm_roti_epi16(A, N) \
+ ((__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)))
+
+#define _mm_roti_epi32(A, N) \
+ ((__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)))
+
+#define _mm_roti_epi64(A, N) \
+ ((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)))
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_shl_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_sha_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
+}
+
+#define _mm_com_epu8(A, B, N) \
+ ((__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (N)))
+
+#define _mm_com_epu16(A, B, N) \
+ ((__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
+ (__v8hi)(__m128i)(B), (N)))
+
+#define _mm_com_epu32(A, B, N) \
+ ((__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B), (N)))
+
+#define _mm_com_epu64(A, B, N) \
+ ((__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B), (N)))
+
+#define _mm_com_epi8(A, B, N) \
+ ((__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (N)))
+
+#define _mm_com_epi16(A, B, N) \
+ ((__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
+ (__v8hi)(__m128i)(B), (N)))
+
+#define _mm_com_epi32(A, B, N) \
+ ((__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B), (N)))
+
+#define _mm_com_epi64(A, B, N) \
+ ((__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B), (N)))
+
+#define _MM_PCOMCTRL_LT 0
+#define _MM_PCOMCTRL_LE 1
+#define _MM_PCOMCTRL_GT 2
+#define _MM_PCOMCTRL_GE 3
+#define _MM_PCOMCTRL_EQ 4
+#define _MM_PCOMCTRL_NEQ 5
+#define _MM_PCOMCTRL_FALSE 6
+#define _MM_PCOMCTRL_TRUE 7
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epu64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epu64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epu64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epu64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epu64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epu64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epu64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epu64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi8(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi16(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi32(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comlt_epi64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comle_epi64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comgt_epi64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comge_epi64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comeq_epi64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comneq_epi64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comfalse_epi64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE);
+}
+
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_comtrue_epi64(__m128i __A, __m128i __B)
+{
+ return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
+}
+
+#define _mm_permute2_pd(X, Y, C, I) \
+ ((__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (__v2di)(__m128i)(C), (I)))
+
+#define _mm256_permute2_pd(X, Y, C, I) \
+ ((__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), \
+ (__v4di)(__m256i)(C), (I)))
+
+#define _mm_permute2_ps(X, Y, C, I) \
+ ((__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
+ (__v4si)(__m128i)(C), (I)))
+
+#define _mm256_permute2_ps(X, Y, C, I) \
+ ((__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), \
+ (__v8si)(__m256i)(C), (I)))
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_frcz_ss(__m128 __A)
+{
+ return (__m128)__builtin_ia32_vfrczss((__v4sf)__A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_frcz_sd(__m128d __A)
+{
+ return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A);
+}
+
+static __inline__ __m128 __DEFAULT_FN_ATTRS
+_mm_frcz_ps(__m128 __A)
+{
+ return (__m128)__builtin_ia32_vfrczps((__v4sf)__A);
+}
+
+static __inline__ __m128d __DEFAULT_FN_ATTRS
+_mm_frcz_pd(__m128d __A)
+{
+ return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
+}
+
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_frcz_ps(__m256 __A)
+{
+ return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
+}
+
+static __inline__ __m256d __DEFAULT_FN_ATTRS256
+_mm256_frcz_pd(__m256d __A)
+{
+ return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
+}
+
+#undef __DEFAULT_FN_ATTRS
+#undef __DEFAULT_FN_ATTRS256
+
+#endif /* __XOPINTRIN_H */
--- /dev/null
+/*===---- xsavecintrin.h - XSAVEC intrinsic --------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVECINTRIN_H
+#define __XSAVECINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsavec")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsavec(void *__p, unsigned long long __m) {
+ __builtin_ia32_xsavec(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsavec64(void *__p, unsigned long long __m) {
+ __builtin_ia32_xsavec64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===---- xsaveintrin.h - XSAVE intrinsic ----------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVEINTRIN_H
+#define __XSAVEINTRIN_H
+
+#ifdef _MSC_VER
+#define _XCR_XFEATURE_ENABLED_MASK 0
+#endif
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsave")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsave(void *__p, unsigned long long __m) {
+ __builtin_ia32_xsave(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstor(void *__p, unsigned long long __m) {
+ __builtin_ia32_xrstor(__p, __m);
+}
+
+#ifndef _MSC_VER
+#define _xgetbv(A) __builtin_ia32_xgetbv((long long)(A))
+#define _xsetbv(A, B) __builtin_ia32_xsetbv((unsigned int)(A), (unsigned long long)(B))
+#else
+#ifdef __cplusplus
+extern "C" {
+#endif
+unsigned __int64 __cdecl _xgetbv(unsigned int);
+void __cdecl _xsetbv(unsigned int, unsigned __int64);
+#ifdef __cplusplus
+}
+#endif
+#endif /* _MSC_VER */
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsave64(void *__p, unsigned long long __m) {
+ __builtin_ia32_xsave64(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstor64(void *__p, unsigned long long __m) {
+ __builtin_ia32_xrstor64(__p, __m);
+}
+
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
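For the non-compacted variants above, a sketch of a save/restore round trip (assumptions: -mxsave and a 4096-byte area; production code sizes the buffer from CPUID leaf 0Dh):

#include <immintrin.h>
#include <stdint.h>

static uint8_t state_area[4096] __attribute__((aligned(64)));

static void xsave_roundtrip(void)
{
    /* XCR0 (_XCR_XFEATURE_ENABLED_MASK == 0) reports which state
     * components the OS has enabled. */
    unsigned long long xcr0 = _xgetbv(0);
    _xsave(state_area, xcr0);
    _xrstor(state_area, xcr0);
}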
--- /dev/null
+/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVEOPTINTRIN_H
+#define __XSAVEOPTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsaveopt")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaveopt(void *__p, unsigned long long __m) {
+ __builtin_ia32_xsaveopt(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaveopt64(void *__p, unsigned long long __m) {
+ __builtin_ia32_xsaveopt64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
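XSAVEOPT behaves like XSAVE but may skip components that are unchanged since the last restore from the same area, so it pays off when the same context is saved repeatedly. A sketch under the same buffer-size assumption as above (-mxsave -mxsaveopt):

#include <immintrin.h>
#include <stdint.h>

static uint8_t ctx_area[4096] __attribute__((aligned(64)));

static void save_context_again(unsigned long long mask)
{
    _xsaveopt(ctx_area, mask);   /* may write only the modified components */
}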
--- /dev/null
+/*===---- xsavesintrin.h - XSAVES intrinsic --------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XSAVESINTRIN_H
+#define __XSAVESINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsaves")))
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaves(void *__p, unsigned long long __m) {
+ __builtin_ia32_xsaves(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstors(void *__p, unsigned long long __m) {
+ __builtin_ia32_xrstors(__p, __m);
+}
+
+#ifdef __x86_64__
+static __inline__ void __DEFAULT_FN_ATTRS
+_xrstors64(void *__p, unsigned long long __m) {
+ __builtin_ia32_xrstors64(__p, __m);
+}
+
+static __inline__ void __DEFAULT_FN_ATTRS
+_xsaves64(void *__p, unsigned long long __m) {
+ __builtin_ia32_xsaves64(__p, __m);
+}
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+
+#endif
--- /dev/null
+/*===---- xtestintrin.h - XTEST intrinsic ----------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <xtestintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __XTESTINTRIN_H
+#define __XTESTINTRIN_H
+
+/* xtest returns non-zero if the instruction is executed within an RTM or active
+ * HLE region. */
+/* FIXME: This can be an either or for RTM/HLE. Deal with this when HLE is
+ * supported. */
+static __inline__ int
+ __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
+ _xtest(void) {
+ return __builtin_ia32_xtest();
+}
+
+#endif
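A sketch of _xtest inside an RTM transaction (build with -mrtm; _xbegin, _xend, and _XBEGIN_STARTED come from rtmintrin.h, also reached through <immintrin.h>):

#include <immintrin.h>

static int increment_transactionally(int *counter)
{
    if (_xbegin() == _XBEGIN_STARTED) {
        int in_txn = _xtest();   /* non-zero: executing transactionally */
        ++*counter;
        _xend();
        return in_txn;
    }
    return 0;   /* transaction aborted; fall back to a lock in real code */
}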
+++ /dev/null
-/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __WMMINTRIN_H
-#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
-#endif
-
-#ifndef __WMMINTRIN_AES_H
-#define __WMMINTRIN_AES_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))
-
-/// Performs a single round of AES encryption using the Equivalent
-/// Inverse Cipher, transforming the state value from the first source
-/// operand using a 128-bit round key value contained in the second source
-/// operand, and writes the result to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VAESENC </c> instruction.
-///
-/// \param __V
-/// A 128-bit integer vector containing the state value.
-/// \param __R
-/// A 128-bit integer vector containing the round key value.
-/// \returns A 128-bit integer vector containing the encrypted value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_aesenc_si128(__m128i __V, __m128i __R)
-{
- return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
-}
-
-/// Performs the final round of AES encryption using the Equivalent
-/// Inverse Cipher, transforming the state value from the first source
-/// operand using a 128-bit round key value contained in the second source
-/// operand, and writes the result to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VAESENCLAST </c> instruction.
-///
-/// \param __V
-/// A 128-bit integer vector containing the state value.
-/// \param __R
-/// A 128-bit integer vector containing the round key value.
-/// \returns A 128-bit integer vector containing the encrypted value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_aesenclast_si128(__m128i __V, __m128i __R)
-{
- return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R);
-}
-
-/// Performs a single round of AES decryption using the Equivalent
-/// Inverse Cipher, transforming the state value from the first source
-/// operand using a 128-bit round key value contained in the second source
-/// operand, and writes the result to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VAESDEC </c> instruction.
-///
-/// \param __V
-/// A 128-bit integer vector containing the state value.
-/// \param __R
-/// A 128-bit integer vector containing the round key value.
-/// \returns A 128-bit integer vector containing the decrypted value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_aesdec_si128(__m128i __V, __m128i __R)
-{
- return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R);
-}
-
-/// Performs the final round of AES decryption using the Equivalent
-/// Inverse Cipher, transforming the state value from the first source
-/// operand using a 128-bit round key value contained in the second source
-/// operand, and writes the result to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VAESDECLAST </c> instruction.
-///
-/// \param __V
-/// A 128-bit integer vector containing the state value.
-/// \param __R
-/// A 128-bit integer vector containing the round key value.
-/// \returns A 128-bit integer vector containing the decrypted value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_aesdeclast_si128(__m128i __V, __m128i __R)
-{
- return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R);
-}
-
-/// Applies the AES InvMixColumns() transformation to an expanded key
-/// contained in the source operand, and writes the result to the
-/// destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VAESIMC </c> instruction.
-///
-/// \param __V
-/// A 128-bit integer vector containing the expanded key.
-/// \returns A 128-bit integer vector containing the transformed value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_aesimc_si128(__m128i __V)
-{
- return (__m128i)__builtin_ia32_aesimc128((__v2di)__V);
-}
-
-/// Generates a round key for AES encryption, operating on 128-bit data
-/// specified in the first source operand and using an 8-bit round constant
-/// specified by the second source operand, and writes the result to the
-/// destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> AESKEYGENASSIST </c> instruction.
-///
-/// \param C
-/// A 128-bit integer vector that is used to generate the AES encryption key.
-/// \param R
-/// An 8-bit round constant used to generate the AES encryption key.
-/// \returns A 128-bit round key for AES encryption.
-#define _mm_aeskeygenassist_si128(C, R) \
- ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R)))
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __WMMINTRIN_AES_H */
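The header removed above documents the classic AES-NI surface. An illustrative sketch of a full AES-128 block encryption given pre-expanded round keys (key expansion via _mm_aeskeygenassist_si128 omitted; build with -maes):

#include <wmmintrin.h>

/* Sketch only: encrypt one 16-byte block with AES-128 given 11 expanded
 * round keys rk[0..10]; _mm_xor_si128 comes from <emmintrin.h>, which
 * wmmintrin.h includes. */
static __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11])
{
    block = _mm_xor_si128(block, rk[0]);        /* initial AddRoundKey */
    for (int i = 1; i < 10; ++i)
        block = _mm_aesenc_si128(block, rk[i]); /* rounds 1..9 */
    return _mm_aesenclast_si128(block, rk[10]); /* final round, no MixColumns */
}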
+++ /dev/null
-/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __WMMINTRIN_H
-#error "Never use <__wmmintrin_pclmul.h> directly; include <wmmintrin.h> instead."
-#endif
-
-#ifndef __WMMINTRIN_PCLMUL_H
-#define __WMMINTRIN_PCLMUL_H
-
-/// Multiplies two 64-bit integer values, which are selected from source
-/// operands using the immediate-value operand. The multiplication is a
-/// carry-less multiplication, and the 128-bit integer product is stored in
-/// the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCLMULQDQ </c> instruction.
-///
-/// \param __X
-/// A 128-bit vector of [2 x i64] containing one of the source operands.
-/// \param __Y
-/// A 128-bit vector of [2 x i64] containing one of the source operands.
-/// \param __I
-/// An immediate value specifying which 64-bit values to select from the
-/// operands. Bit 0 is used to select a value from operand \a __X, and bit
-/// 4 is used to select a value from operand \a __Y: \n
-/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n
-/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n
-/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n
-/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used.
-/// \returns The 128-bit integer vector containing the result of the carry-less
-/// multiplication of the selected 64-bit values.
-#define _mm_clmulepi64_si128(X, Y, I) \
- ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \
- (__v2di)(__m128i)(Y), (char)(I)))
-
-#endif /* __WMMINTRIN_PCLMUL_H */
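A short sketch of the carry-less multiply (build with -mpclmul): bits 0 and 4 of the immediate select which 64-bit half of each operand feeds the multiply, so 0x00 pairs the two low halves and 0x11 the two high halves.

#include <wmmintrin.h>

static __m128i clmul_lo_lo(__m128i a, __m128i b)
{
    return _mm_clmulepi64_si128(a, b, 0x00);   /* a[63:0] * b[63:0] */
}

static __m128i clmul_hi_hi(__m128i a, __m128i b)
{
    return _mm_clmulepi64_si128(a, b, 0x11);   /* a[127:64] * b[127:64] */
}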
+++ /dev/null
-/*===---- adxintrin.h - ADX intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <adxintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __ADXINTRIN_H
-#define __ADXINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
-
-/* Intrinsics that are available only if __ADX__ defined */
-static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
-_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
- unsigned int *__p)
-{
- return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
-}
-
-#ifdef __x86_64__
-static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx")))
-_addcarryx_u64(unsigned char __cf, unsigned long long __x,
- unsigned long long __y, unsigned long long *__p)
-{
- return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
-}
-#endif
-
-/* Intrinsics that are also available if __ADX__ undefined */
-static __inline unsigned char __DEFAULT_FN_ATTRS
-_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
- unsigned int *__p)
-{
- return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p);
-}
-
-#ifdef __x86_64__
-static __inline unsigned char __DEFAULT_FN_ATTRS
-_addcarry_u64(unsigned char __cf, unsigned long long __x,
- unsigned long long __y, unsigned long long *__p)
-{
- return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p);
-}
-#endif
-
-static __inline unsigned char __DEFAULT_FN_ATTRS
-_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y,
- unsigned int *__p)
-{
- return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p);
-}
-
-#ifdef __x86_64__
-static __inline unsigned char __DEFAULT_FN_ATTRS
-_subborrow_u64(unsigned char __cf, unsigned long long __x,
- unsigned long long __y, unsigned long long *__p)
-{
- return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __ADXINTRIN_H */
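The carry/borrow helpers chain naturally for multi-precision arithmetic. A sketch of a 256-bit add built from four _addcarry_u64 steps (x86-64 only; the _addcarryx_u64 form additionally needs ADX):

#include <immintrin.h>

/* dst = a + b over four 64-bit limbs, least-significant limb first;
 * returns the carry out of the top limb. */
static unsigned char add256(unsigned long long dst[4],
                            const unsigned long long a[4],
                            const unsigned long long b[4])
{
    unsigned char carry = 0;
    for (int i = 0; i < 4; ++i)
        carry = _addcarry_u64(carry, a[i], b[i], &dst[i]);
    return carry;
}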
+++ /dev/null
-/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __AMMINTRIN_H
-#define __AMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include <pmmintrin.h>
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128)))
-
-/// Extracts the specified bits from the lower 64 bits of the 128-bit
-/// integer vector operand at the index \a idx and of the length \a len.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
-///
-/// \param x
-/// The value from which bits are extracted.
-/// \param len
-/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
-/// are zero, the length is interpreted as 64.
-/// \param idx
-/// Bits [5:0] specify the index of the least significant bit; the other
-/// bits are ignored. If the sum of the index and length is greater than 64,
-/// the result is undefined. If the length and index are both zero, bits
-/// [63:0] of parameter \a x are extracted. If the length is zero but the
-/// index is non-zero, the result is undefined.
-/// \returns A 128-bit integer vector whose lower 64 bits contain the bits
-/// extracted from the source operand.
-#define _mm_extracti_si64(x, len, idx) \
- ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \
- (char)(len), (char)(idx)))
-
-/// Extracts the specified bits from the lower 64 bits of the 128-bit
-/// integer vector operand at the index and of the length specified by
-/// \a __y.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> EXTRQ </c> instruction.
-///
-/// \param __x
-/// The value from which bits are extracted.
-/// \param __y
-/// Specifies the index of the least significant bit at [13:8] and the
-/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the
-/// length is interpreted as 64. If the sum of the index and length is
-/// greater than 64, the result is undefined. If the length and index are
-/// both zero, bits [63:0] of parameter \a __x are extracted. If the length
-/// is zero but the index is non-zero, the result is undefined.
-/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted
-/// from the source operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_extract_si64(__m128i __x, __m128i __y)
-{
- return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y);
-}
-
-/// Inserts bits of a specified length from the source integer vector
-/// \a y into the lower 64 bits of the destination integer vector \a x at
-/// the index \a idx and of the length \a len.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len,
-/// const int idx);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
-///
-/// \param x
-/// The destination operand where bits will be inserted. The inserted bits
-/// are defined by the length \a len and by the index \a idx specifying the
-/// least significant bit.
-/// \param y
-/// The source operand containing the bits to be extracted. The extracted
-/// bits are the least significant bits of operand \a y of length \a len.
-/// \param len
-/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0]
-/// are zero, the length is interpreted as 64.
-/// \param idx
-/// Bits [5:0] specify the index of the least significant bit; the other
-/// bits are ignored. If the sum of the index and length is greater than 64,
-/// the result is undefined. If the length and index are both zero, bits
-/// [63:0] of parameter \a y are inserted into parameter \a x. If the length
-/// is zero but the index is non-zero, the result is undefined.
-/// \returns A 128-bit integer vector containing the original lower 64-bits of
-/// destination operand \a x with the specified bitfields replaced by the
-/// lower bits of source operand \a y. The upper 64 bits of the return value
-/// are undefined.
-#define _mm_inserti_si64(x, y, len, idx) \
- ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \
- (__v2di)(__m128i)(y), \
- (char)(len), (char)(idx)))
-
-/// Inserts bits of a specified length from the source integer vector
-/// \a __y into the lower 64 bits of the destination integer vector \a __x
-/// at the index and of the length specified by \a __y.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> INSERTQ </c> instruction.
-///
-/// \param __x
-/// The destination operand where bits will be inserted. The inserted bits
-/// are defined by the length and by the index of the least significant bit
-/// specified by operand \a __y.
-/// \param __y
-/// The source operand containing the bits to be extracted. The extracted
-/// bits are the least significant bits of operand \a __y with length
-/// specified by bits [69:64]. These are inserted into the destination at the
-/// index specified by bits [77:72]; all other bits are ignored. If bits
-/// [69:64] are zero, the length is interpreted as 64. If the sum of the
-/// index and length is greater than 64, the result is undefined. If the
-/// length and index are both zero, bits [63:0] of parameter \a __y are
-/// inserted into parameter \a __x. If the length is zero but the index is
-/// non-zero, the result is undefined.
-/// \returns A 128-bit integer vector containing the original lower 64-bits of
-/// destination operand \a __x with the specified bitfields replaced by the
-/// lower bits of source operand \a __y. The upper 64 bits of the return
-/// value are undefined.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_insert_si64(__m128i __x, __m128i __y)
-{
- return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y);
-}
-
-/// Stores a 64-bit double-precision value in a 64-bit memory location.
-/// To minimize caching, the data is flagged as non-temporal (unlikely to be
-/// used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVNTSD </c> instruction.
-///
-/// \param __p
-/// The 64-bit memory location used to store the register value.
-/// \param __a
-/// The 64-bit double-precision floating-point register value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_sd(double *__p, __m128d __a)
-{
- __builtin_ia32_movntsd(__p, (__v2df)__a);
-}
-
-/// Stores a 32-bit single-precision floating-point value in a 32-bit
-/// memory location. To minimize caching, the data is flagged as
-/// non-temporal (unlikely to be used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVNTSS </c> instruction.
-///
-/// \param __p
-/// The 32-bit memory location used to store the register value.
-/// \param __a
-/// The 32-bit single-precision floating-point register value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_ss(float *__p, __m128 __a)
-{
- __builtin_ia32_movntss(__p, (__v4sf)__a);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __AMMINTRIN_H */
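A sketch of the immediate-form bit-field extract (AMD SSE4a, built with -msse4a on x86-64): pull the 8-bit field starting at bit 4 of a 64-bit value with EXTRQ.

#include <ammintrin.h>
#include <stdint.h>

static uint64_t extract_bits_4_to_11(uint64_t v)
{
    __m128i x = _mm_cvtsi64_si128((long long)v);
    __m128i r = _mm_extracti_si64(x, 8 /* len */, 4 /* idx */);
    return (uint64_t)_mm_cvtsi128_si64(r);   /* low 8 bits hold the field */
}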
+++ /dev/null
-/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===------------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
-#endif /* __IMMINTRIN_H */
-
-#ifndef __AMXINTRIN_H
-#define __AMXINTRIN_H
-#ifdef __x86_64__
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS_TILE \
- __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
-#define __DEFAULT_FN_ATTRS_INT8 \
- __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
-#define __DEFAULT_FN_ATTRS_BF16 \
- __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
-
-/// Load tile configuration from a 64-byte memory location specified by
-/// "mem_addr". The tile configuration includes the tile type palette, the
-/// number of bytes per row, and the number of rows. If the specified
-/// palette_id is zero, that signifies the init state for both the tile
-/// config and the tile data, and the tiles are zeroed. Any invalid
-/// configurations will result in #GP fault.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
-///
-/// \param __config
-/// A pointer to 512-bits configuration
-static __inline__ void __DEFAULT_FN_ATTRS_TILE
-_tile_loadconfig(const void *__config) {
- __builtin_ia32_tile_loadconfig(__config);
-}
-
-/// Stores the current tile configuration to a 64-byte memory location
-/// specified by "mem_addr". The tile configuration includes the tile type
-/// palette, the number of bytes per row, and the number of rows. If tiles
-/// are not configured, all zeroes will be stored to memory.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
-///
-/// \param __config
-/// A pointer to 512-bits configuration
-static __inline__ void __DEFAULT_FN_ATTRS_TILE
-_tile_storeconfig(void *__config) {
- __builtin_ia32_tile_storeconfig(__config);
-}
-
-/// Release the tile configuration to return to the init state, which
-/// releases all storage it currently holds.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
-static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
- __builtin_ia32_tilerelease();
-}
-
-/// Load tile rows from memory specifieid by "base" address and "stride" into
-/// destination tile "dst" using the tile configuration previously configured
-/// via "_tile_loadconfig".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
-///
-/// \param dst
-/// A destination tile. Max size is 1024 Bytes.
-/// \param base
-/// A pointer to base address.
-/// \param stride
-/// The stride between the rows' data to be loaded in memory.
-#define _tile_loadd(dst, base, stride) \
- __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
- (__SIZE_TYPE__)(stride))
-
-/// Load tile rows from memory specifieid by "base" address and "stride" into
-/// destination tile "dst" using the tile configuration previously configured
-/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
-/// that the data will likely not be reused in the near future and the data
-/// caching can be optimized accordingly.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
-///
-/// \param dst
-/// A destination tile. Max size is 1024 Bytes.
-/// \param base
-/// A pointer to base address.
-/// \param stride
-/// The stride between the rows' data to be loaded in memory.
-#define _tile_stream_loadd(dst, base, stride) \
- __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
- (__SIZE_TYPE__)(stride))
-
-/// Store the tile specified by "src" to memory specifieid by "base" address and
-/// "stride" using the tile configuration previously configured via
-/// "_tile_loadconfig".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
-///
-/// \param dst
-/// A destination tile. Max size is 1024 Bytes.
-/// \param base
-/// A pointer to base address.
-/// \param stride
-/// The stride between the rows' data to be stored in memory.
-#define _tile_stored(dst, base, stride) \
- __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
-
-/// Zero the tile specified by "tdest".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
-///
-/// \param tile
-/// The destination tile to be zero. Max size is 1024 Bytes.
-#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
-/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
-/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
-/// and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbssd(dst, src0, src1) \
- __builtin_ia32_tdpbssd((dst), (src0), (src1))
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
-/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
-/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in "dst", and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbsud(dst, src0, src1) \
- __builtin_ia32_tdpbsud((dst), (src0), (src1))
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
-/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
-/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
-/// and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbusd(dst, src0, src1) \
- __builtin_ia32_tdpbusd((dst), (src0), (src1))
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
-/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
-/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
-/// "dst", and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbuud(dst, src0, src1) \
- __builtin_ia32_tdpbuud((dst), (src0), (src1))
-
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
-/// src1, accumulating the intermediate single-precision (32-bit) floating-point
-/// elements with elements in "dst", and store the 32-bit result back to tile
-/// "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbf16ps(dst, src0, src1) \
- __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
-
-/// AMX tile register size can be configured, the maximum size is 16x64=1024
-/// bytes. Since there is no 2D type in llvm IR, we use vector type to
-/// represent 2D tile and the fixed size is maximum amx tile register size.
-typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
-_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
- __SIZE_TYPE__ stride) {
- return __builtin_ia32_tileloadd64_internal(m, n, base,
- (__SIZE_TYPE__)(stride));
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
-_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
- __SIZE_TYPE__ stride) {
- return __builtin_ia32_tileloaddt164_internal(m, n, base,
- (__SIZE_TYPE__)(stride));
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
-_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
- return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
-_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
- return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
-_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
- return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
-_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
- return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ void __DEFAULT_FN_ATTRS_INT8
-_tile_stored_internal(unsigned short m, unsigned short n, void *base,
- __SIZE_TYPE__ stride, _tile1024i tile) {
- return __builtin_ia32_tilestored64_internal(m, n, base,
- (__SIZE_TYPE__)(stride), tile);
-}
-
-/// This is internal intrinsic. C/C++ user should avoid calling it directly.
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
-_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
- _tile1024i dst, _tile1024i src1, _tile1024i src2) {
- return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
-}
-
-/// This struct pack the shape and tile data together for user. We suggest
-/// initializing the struct as early as possible, because compiler depends
-/// on the shape information to do configure. The constant value is preferred
-/// for optimization by compiler.
-typedef struct __tile1024i_str {
- const unsigned short row;
- const unsigned short col;
- _tile1024i tile;
-} __tile1024i;
-
-/// Load tile rows from memory specifieid by "base" address and "stride" into
-/// destination tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
-///
-/// \param dst
-/// A destination tile. Max size is 1024 Bytes.
-/// \param base
-/// A pointer to base address.
-/// \param stride
-/// The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS_TILE
-static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
- __SIZE_TYPE__ stride) {
- dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
-}
-
-/// Load tile rows from memory specifieid by "base" address and "stride" into
-/// destination tile "dst". This intrinsic provides a hint to the implementation
-/// that the data will likely not be reused in the near future and the data
-/// caching can be optimized accordingly.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
-///
-/// \param dst
-/// A destination tile. Max size is 1024 Bytes.
-/// \param base
-/// A pointer to base address.
-/// \param stride
-/// The stride between the rows' data to be loaded in memory.
-__DEFAULT_FN_ATTRS_TILE
-static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
- __SIZE_TYPE__ stride) {
- dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
-}
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
-/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
-/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
-/// and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_INT8
-static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
- dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
- src0.tile, src1.tile);
-}
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
-/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
-/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in "dst", and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_INT8
-static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
- dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
- src0.tile, src1.tile);
-}
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
-/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
-/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
-/// and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_INT8
-static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
- dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
- src0.tile, src1.tile);
-}
-
-/// Compute dot-product of bytes in tiles with a source/destination accumulator.
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
-/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
-/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
-/// "dst", and store the 32-bit result back to tile "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_INT8
-static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
- dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
- src0.tile, src1.tile);
-}
-
-/// Store the tile specified by "src" to memory specifieid by "base" address and
-/// "stride".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
-///
-/// \param dst
-/// A destination tile. Max size is 1024 Bytes.
-/// \param base
-/// A pointer to base address.
-/// \param stride
-/// The stride between the rows' data to be stored in memory.
-__DEFAULT_FN_ATTRS_TILE
-static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
- __tile1024i src) {
- _tile_stored_internal(src.row, src.col, base, stride, src.tile);
-}
-
-/// Zero the tile specified by "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
-///
-/// \param dst
-/// The destination tile to be zero. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_TILE
-static __inline__ void __tile_zero(__tile1024i *dst) {
- dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
-}
-
-/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
-/// src1, accumulating the intermediate single-precision (32-bit) floating-point
-/// elements with elements in "dst", and store the 32-bit result back to tile
-/// "dst".
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
-///
-/// \param dst
-/// The destination tile. Max size is 1024 Bytes.
-/// \param src0
-/// The 1st source tile. Max size is 1024 Bytes.
-/// \param src1
-/// The 2nd source tile. Max size is 1024 Bytes.
-__DEFAULT_FN_ATTRS_BF16
-static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
- __tile1024i src1) {
- dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
- src0.tile, src1.tile);
-}
-
-#undef __DEFAULT_FN_ATTRS_TILE
-#undef __DEFAULT_FN_ATTRS_INT8
-#undef __DEFAULT_FN_ATTRS_BF16
-
-#endif /* __x86_64__ */
-#endif /* __AMXINTRIN_H */
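A rough end-to-end sketch of the macro-level AMX API above (x86-64, -mamx-tile -mamx-int8). The 64-byte tile-configuration layout follows the LDTILECFG format; the Linux arch_prctl(ARCH_REQ_XCOMP_PERM, ...) permission request and CPUID feature checks are omitted, and the 16x64 shapes are assumptions.

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

struct tile_config {
    uint8_t  palette_id;
    uint8_t  start_row;
    uint8_t  reserved[14];
    uint16_t colsb[16];   /* bytes per row for tmm0..tmm15 */
    uint8_t  rows[16];    /* rows for tmm0..tmm15 */
};

/* c (16x16 int32) += a (16x64 int8) . b, using signed*signed dot products. */
static void int8_tile_dpbssd(const int8_t *a, const int8_t *b, int32_t *c)
{
    struct tile_config cfg;
    memset(&cfg, 0, sizeof(cfg));
    cfg.palette_id = 1;
    cfg.rows[0] = 16; cfg.colsb[0] = 64;   /* tmm0: 16x16 int32 accumulator */
    cfg.rows[1] = 16; cfg.colsb[1] = 64;   /* tmm1: 16x64 int8 */
    cfg.rows[2] = 16; cfg.colsb[2] = 64;   /* tmm2: 16 rows of packed int8 */
    _tile_loadconfig(&cfg);

    _tile_zero(0);
    _tile_loadd(1, a, 64);                 /* 64-byte row stride */
    _tile_loadd(2, b, 64);
    _tile_dpbssd(0, 1, 2);
    _tile_stored(0, c, 64);
    _tile_release();
}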
+++ /dev/null
-/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX2INTRIN_H
-#define __AVX2INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128)))
-
-/* SSE4 Multiple Packed Sums of Absolute Difference. */
-#define _mm256_mpsadbw_epu8(X, Y, M) \
- ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
- (__v32qi)(__m256i)(Y), (int)(M)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_abs_epi8(__m256i __a)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
-#else
- return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_abs_epi16(__m256i __a)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
-#else
- return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_abs_epi32(__m256i __a)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
-#else
- return (__m256i)__builtin_elementwise_abs((__v8si)__a);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi32(__m256i __V1, __m256i __V2)
-{
- return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_add_epi8(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v32qu)__a + (__v32qu)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_add_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v16hu)__a + (__v16hu)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_add_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v8su)__a + (__v8su)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_add_epi64(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v4du)__a + (__v4du)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_adds_epi8(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ > 14)
- return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b);
-#else
- return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_adds_epi16(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ > 14)
- return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b);
-#else
- return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_adds_epu8(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ > 14)
- return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b);
-#else
- return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_adds_epu16(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ > 14)
- return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b);
-#else
- return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
-#endif
-}
-
-#define _mm256_alignr_epi8(a, b, n) \
- ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
- (__v32qi)(__m256i)(b), (n)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_and_si256(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v4du)__a & (__v4du)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_andnot_si256(__m256i __a, __m256i __b)
-{
- return (__m256i)(~(__v4du)__a & (__v4du)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_avg_epu8(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_avg_epu16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
-{
- return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
- (__v32qi)__M);
-}
-
-#define _mm256_blend_epi16(V1, V2, M) \
- ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
- (__v16hi)(__m256i)(V2), (int)(M)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v32qi)__a == (__v32qi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v16hi)__a == (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v8si)__a == (__v8si)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v4di)__a == (__v4di)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
-{
- /* This function always performs a signed comparison, but __v32qi is a char
- which may be signed or unsigned, so use __v32qs. */
- return (__m256i)((__v32qs)__a > (__v32qs)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v16hi)__a > (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v8si)__a > (__v8si)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v4di)__a > (__v4di)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadd_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadd_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadds_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsub_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsub_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsubs_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maddubs_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epi8(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b);
-#else
- return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epi16(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b);
-#else
- return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epi32(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b);
-#else
- return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epu8(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b);
-#else
- return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epu16(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b);
-#else
- return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epu32(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b);
-#else
- return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epi8(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b);
-#else
- return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epi16(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b);
-#else
- return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epi32(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b);
-#else
- return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epu8(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b);
-#else
- return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epu16(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b);
-#else
- return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epu32(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b);
-#else
- return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
-#endif
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS256
-_mm256_movemask_epi8(__m256i __a)
-{
- return __builtin_ia32_pmovmskb256((__v32qi)__a);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi8_epi16(__m128i __V)
-{
- /* This function always performs a signed extension, but __v16qi is a char
- which may be signed or unsigned, so use __v16qs. */
- return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi8_epi32(__m128i __V)
-{
- /* This function always performs a signed extension, but __v16qi is a char
- which may be signed or unsigned, so use __v16qs. */
- return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi8_epi64(__m128i __V)
-{
- /* This function always performs a signed extension, but __v16qi is a char
- which may be signed or unsigned, so use __v16qs. */
- return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi16_epi32(__m128i __V)
-{
- return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi16_epi64(__m128i __V)
-{
- return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi32_epi64(__m128i __V)
-{
- return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepu8_epi16(__m128i __V)
-{
- return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepu8_epi32(__m128i __V)
-{
- return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepu8_epi64(__m128i __V)
-{
- return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepu16_epi32(__m128i __V)
-{
- return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepu16_epi64(__m128i __V)
-{
- return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtepu32_epi64(__m128i __V)
-{
- return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mul_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mulhi_epu16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mulhi_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mullo_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v16hu)__a * (__v16hu)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mullo_epi32 (__m256i __a, __m256i __b)
-{
- return (__m256i)((__v8su)__a * (__v8su)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mul_epu32(__m256i __a, __m256i __b)
-{
- return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_or_si256(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v4du)__a | (__v4du)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sad_epu8(__m256i __a, __m256i __b)
-{
- return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shuffle_epi8(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
-}
-
-#define _mm256_shuffle_epi32(a, imm) \
- ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
-
-#define _mm256_shufflehi_epi16(a, imm) \
- ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
-
-#define _mm256_shufflelo_epi16(a, imm) \
- ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sign_epi8(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sign_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sign_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
-}
-
-#define _mm256_slli_si256(a, imm) \
- ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
-
-#define _mm256_bslli_epi128(a, imm) \
- ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_slli_epi16(__m256i __a, int __count)
-{
- return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sll_epi16(__m256i __a, __m128i __count)
-{
- return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_slli_epi32(__m256i __a, int __count)
-{
- return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sll_epi32(__m256i __a, __m128i __count)
-{
- return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_slli_epi64(__m256i __a, int __count)
-{
- return __builtin_ia32_psllqi256((__v4di)__a, __count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sll_epi64(__m256i __a, __m128i __count)
-{
- return __builtin_ia32_psllq256((__v4di)__a, __count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srai_epi16(__m256i __a, int __count)
-{
- return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sra_epi16(__m256i __a, __m128i __count)
-{
- return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srai_epi32(__m256i __a, int __count)
-{
- return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sra_epi32(__m256i __a, __m128i __count)
-{
- return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
-}
-
-#define _mm256_srli_si256(a, imm) \
- ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
-
-#define _mm256_bsrli_epi128(a, imm) \
- ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srli_epi16(__m256i __a, int __count)
-{
- return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srl_epi16(__m256i __a, __m128i __count)
-{
- return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srli_epi32(__m256i __a, int __count)
-{
- return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srl_epi32(__m256i __a, __m128i __count)
-{
- return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srli_epi64(__m256i __a, int __count)
-{
- return __builtin_ia32_psrlqi256((__v4di)__a, __count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srl_epi64(__m256i __a, __m128i __count)
-{
- return __builtin_ia32_psrlq256((__v4di)__a, __count);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sub_epi8(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v32qu)__a - (__v32qu)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sub_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v16hu)__a - (__v16hu)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sub_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v8su)__a - (__v8su)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sub_epi64(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v4du)__a - (__v4du)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_subs_epi8(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ > 14)
- return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b);
-#else
- return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_subs_epi16(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ > 14)
- return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b);
-#else
- return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_subs_epu8(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ > 14)
- return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b);
-#else
- return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_subs_epu16(__m256i __a, __m256i __b)
-{
-#if (__clang_major__ > 14)
- return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b);
-#else
- return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_xor_si256(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v4du)__a ^ (__v4du)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_stream_load_si256(__m256i const *__V)
-{
- typedef __v4di __v4di_aligned __attribute__((aligned(32)));
- return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_broadcastss_ps(__m128 __X)
-{
- return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_broadcastsd_pd(__m128d __a)
-{
- return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_broadcastss_ps(__m128 __X)
-{
- return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_broadcastsd_pd(__m128d __X)
-{
- return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastsi128_si256(__m128i __X)
-{
- return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1);
-}
-
-#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
-
-#define _mm_blend_epi32(V1, V2, M) \
- ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
- (__v4si)(__m128i)(V2), (int)(M)))
-
-#define _mm256_blend_epi32(V1, V2, M) \
- ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
- (__v8si)(__m256i)(V2), (int)(M)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastb_epi8(__m128i __X)
-{
- return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastw_epi16(__m128i __X)
-{
- return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastd_epi32(__m128i __X)
-{
- return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastq_epi64(__m128i __X)
-{
- return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastb_epi8(__m128i __X)
-{
- return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastw_epi16(__m128i __X)
-{
- return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastd_epi32(__m128i __X)
-{
- return (__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastq_epi64(__m128i __X)
-{
- return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
-}
-
-#define _mm256_permute4x64_pd(V, M) \
- ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
-{
- return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
-}
-
-#define _mm256_permute4x64_epi64(V, M) \
- ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
-
-#define _mm256_permute2x128_si256(V1, V2, M) \
- ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
-
-#define _mm256_extracti128_si256(V, M) \
- ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
-
-#define _mm256_inserti128_si256(V1, V2, M) \
- ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
- (__v2di)(__m128i)(V2), (int)(M)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskload_epi32(int const *__X, __m256i __M)
-{
- return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskload_epi64(long long const *__X, __m256i __M)
-{
- return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskload_epi32(int const *__X, __m128i __M)
-{
- return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskload_epi64(long long const *__X, __m128i __M)
-{
- return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
-{
- __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
-{
- __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
-{
- __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
-{
- __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sllv_epi32(__m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_sllv_epi32(__m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sllv_epi64(__m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_sllv_epi64(__m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srav_epi32(__m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srav_epi32(__m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srlv_epi32(__m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srlv_epi32(__m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srlv_epi64(__m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srlv_epi64(__m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
-}
-
-#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
- ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
- (double const *)(m), \
- (__v4si)(__m128i)(i), \
- (__v2df)(__m128d)(mask), (s)))
-
-#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
- ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
- (double const *)(m), \
- (__v4si)(__m128i)(i), \
- (__v4df)(__m256d)(mask), (s)))
-
-#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
- ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
- (double const *)(m), \
- (__v2di)(__m128i)(i), \
- (__v2df)(__m128d)(mask), (s)))
-
-#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
- ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
- (double const *)(m), \
- (__v4di)(__m256i)(i), \
- (__v4df)(__m256d)(mask), (s)))
-
-#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
- ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
- (float const *)(m), \
- (__v4si)(__m128i)(i), \
- (__v4sf)(__m128)(mask), (s)))
-
-#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
- ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
- (float const *)(m), \
- (__v8si)(__m256i)(i), \
- (__v8sf)(__m256)(mask), (s)))
-
-#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
- ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
- (float const *)(m), \
- (__v2di)(__m128i)(i), \
- (__v4sf)(__m128)(mask), (s)))
-
-#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
- ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
- (float const *)(m), \
- (__v4di)(__m256i)(i), \
- (__v4sf)(__m128)(mask), (s)))
-
-#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
- ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
- (int const *)(m), \
- (__v4si)(__m128i)(i), \
- (__v4si)(__m128i)(mask), (s)))
-
-#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
- ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
- (int const *)(m), \
- (__v8si)(__m256i)(i), \
- (__v8si)(__m256i)(mask), (s)))
-
-#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
- ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
- (int const *)(m), \
- (__v2di)(__m128i)(i), \
- (__v4si)(__m128i)(mask), (s)))
-
-#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
- ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
- (int const *)(m), \
- (__v4di)(__m256i)(i), \
- (__v4si)(__m128i)(mask), (s)))
-
-#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
- ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
- (long long const *)(m), \
- (__v4si)(__m128i)(i), \
- (__v2di)(__m128i)(mask), (s)))
-
-#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
- ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
- (long long const *)(m), \
- (__v4si)(__m128i)(i), \
- (__v4di)(__m256i)(mask), (s)))
-
-#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
- ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
- (long long const *)(m), \
- (__v2di)(__m128i)(i), \
- (__v2di)(__m128i)(mask), (s)))
-
-#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
- ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
- (long long const *)(m), \
- (__v4di)(__m256i)(i), \
- (__v4di)(__m256i)(mask), (s)))
-
-#define _mm_i32gather_pd(m, i, s) \
- ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
- (double const *)(m), \
- (__v4si)(__m128i)(i), \
- (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
- _mm_setzero_pd()), \
- (s)))
-
-#define _mm256_i32gather_pd(m, i, s) \
- ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
- (double const *)(m), \
- (__v4si)(__m128i)(i), \
- (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
- _mm256_setzero_pd(), \
- _CMP_EQ_OQ), \
- (s)))
-
-#define _mm_i64gather_pd(m, i, s) \
- ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
- (double const *)(m), \
- (__v2di)(__m128i)(i), \
- (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
- _mm_setzero_pd()), \
- (s)))
-
-#define _mm256_i64gather_pd(m, i, s) \
- ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
- (double const *)(m), \
- (__v4di)(__m256i)(i), \
- (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
- _mm256_setzero_pd(), \
- _CMP_EQ_OQ), \
- (s)))
-
-#define _mm_i32gather_ps(m, i, s) \
- ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
- (float const *)(m), \
- (__v4si)(__m128i)(i), \
- (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
- _mm_setzero_ps()), \
- (s)))
-
-#define _mm256_i32gather_ps(m, i, s) \
- ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
- (float const *)(m), \
- (__v8si)(__m256i)(i), \
- (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
- _mm256_setzero_ps(), \
- _CMP_EQ_OQ), \
- (s)))
-
-#define _mm_i64gather_ps(m, i, s) \
- ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
- (float const *)(m), \
- (__v2di)(__m128i)(i), \
- (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
- _mm_setzero_ps()), \
- (s)))
-
-#define _mm256_i64gather_ps(m, i, s) \
- ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
- (float const *)(m), \
- (__v4di)(__m256i)(i), \
- (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
- _mm_setzero_ps()), \
- (s)))
-
-#define _mm_i32gather_epi32(m, i, s) \
- ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
- (int const *)(m), (__v4si)(__m128i)(i), \
- (__v4si)_mm_set1_epi32(-1), (s)))
-
-#define _mm256_i32gather_epi32(m, i, s) \
- ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
- (int const *)(m), (__v8si)(__m256i)(i), \
- (__v8si)_mm256_set1_epi32(-1), (s)))
-
-#define _mm_i64gather_epi32(m, i, s) \
- ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
- (int const *)(m), (__v2di)(__m128i)(i), \
- (__v4si)_mm_set1_epi32(-1), (s)))
-
-#define _mm256_i64gather_epi32(m, i, s) \
- ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
- (int const *)(m), (__v4di)(__m256i)(i), \
- (__v4si)_mm_set1_epi32(-1), (s)))
-
-#define _mm_i32gather_epi64(m, i, s) \
- ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
- (long long const *)(m), \
- (__v4si)(__m128i)(i), \
- (__v2di)_mm_set1_epi64x(-1), (s)))
-
-#define _mm256_i32gather_epi64(m, i, s) \
- ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
- (long long const *)(m), \
- (__v4si)(__m128i)(i), \
- (__v4di)_mm256_set1_epi64x(-1), (s)))
-
-#define _mm_i64gather_epi64(m, i, s) \
- ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
- (long long const *)(m), \
- (__v2di)(__m128i)(i), \
- (__v2di)_mm_set1_epi64x(-1), (s)))
-
-#define _mm256_i64gather_epi64(m, i, s) \
- ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
- (long long const *)(m), \
- (__v4di)(__m256i)(i), \
- (__v4di)_mm256_set1_epi64x(-1), (s)))
-
-#undef __DEFAULT_FN_ATTRS256
-#undef __DEFAULT_FN_ATTRS128
-
-#endif /* __AVX2INTRIN_H */
+++ /dev/null
-/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512BF16INTRIN_H
-#define __AVX512BF16INTRIN_H
-
-#if (__clang_major__ > 15)
-typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
-typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
-typedef __bf16 __bfloat16;
-#else
-typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
-typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
-typedef unsigned short __bfloat16;
-#endif
-
-#define __DEFAULT_FN_ATTRS512 \
- __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
- __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
-
-/// Convert One BF16 Data to One Single Float Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic does not correspond to a specific instruction.
-///
-/// \param __A
-/// A bfloat data.
-/// \returns A float data whose sign field and exponent field keep unchanged,
-/// and fraction field is extended to 23 bits.
-static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) {
- return __builtin_ia32_cvtsbf162ss_32(__A);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 512-bit vector of [16 x float].
-/// \param __B
-/// A 512-bit vector of [16 x float].
-/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
-/// conversion of __B, and higher 256 bits come from conversion of __A.
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
-_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
- return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
- (__v16sf) __B);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 512-bit vector of [16 x float].
-/// \param __B
-/// A 512-bit vector of [16 x float].
-/// \param __W
-/// A 512-bit vector of [32 x bfloat].
-/// \param __U
-/// A 32-bit mask value specifying what is chosen for each element.
-/// A 1 means conversion of __A or __B. A 0 means element from __W.
-/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
-/// conversion of __B, and higher 256 bits come from conversion of __A.
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
- return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
- (__v32hi)__W);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 512-bit vector of [16 x float].
-/// \param __B
-/// A 512-bit vector of [16 x float].
-/// \param __U
-/// A 32-bit mask value specifying what is chosen for each element.
-/// A 1 means conversion of __A or __B. A 0 means element is zero.
-/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
-/// conversion of __B, and higher 256 bits come from conversion of __A.
-static __inline__ __m512bh __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
- return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 512-bit vector of [16 x float].
-/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS512
-_mm512_cvtneps_pbh(__m512 __A) {
- return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
- (__v16hi)_mm256_undefined_si256(),
- (__mmask16)-1);
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 512-bit vector of [16 x float].
-/// \param __W
-/// A 256-bit vector of [16 x bfloat].
-/// \param __U
-/// A 16-bit mask value specifying what is chosen for each element.
-/// A 1 means conversion of __A. A 0 means element from __W.
-/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
- return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
- (__v16hi)__W,
- (__mmask16)__U);
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 512-bit vector of [16 x float].
-/// \param __U
-/// A 16-bit mask value specifying what is chosen for each element.
-/// A 1 means conversion of __A. A 0 means element is zero.
-/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
- return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
- (__v16hi)_mm256_setzero_si256(),
- (__mmask16)__U);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-/// A 512-bit vector of [32 x bfloat].
-/// \param __B
-/// A 512-bit vector of [32 x bfloat].
-/// \param __D
-/// A 512-bit vector of [16 x float].
-/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
-/// __A, __B and __D
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
- return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
- (__v16si) __A,
- (__v16si) __B);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-/// A 512-bit vector of [32 x bfloat].
-/// \param __B
-/// A 512-bit vector of [32 x bfloat].
-/// \param __D
-/// A 512-bit vector of [16 x float].
-/// \param __U
-/// A 16-bit mask value specifying what is chosen for each element.
-/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
-/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
-/// __A, __B and __D
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
- (__v16sf)__D);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-/// A 512-bit vector of [32 x bfloat].
-/// \param __B
-/// A 512-bit vector of [32 x bfloat].
-/// \param __D
-/// A 512-bit vector of [16 x float].
-/// \param __U
-/// A 16-bit mask value specifying what is chosen for each element.
-/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
-/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
-/// __A, __B and __D
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
- (__v16sf)_mm512_setzero_si512());
-}
-
-/// Convert Packed BF16 Data to Packed float Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __A
-/// A 256-bit vector of [16 x bfloat].
-/// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
- return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
- (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __U
-/// A 16-bit mask. Elements are zeroed out when the corresponding mask
-/// bit is not set.
-/// \param __A
-/// A 256-bit vector of [16 x bfloat].
-/// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
- return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
- (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data using merging mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __S
-/// A 512-bit vector of [16 x float]. Elements are copied from __S when
-/// the corresponding mask bit is not set.
-/// \param __U
-/// A 16-bit mask.
-/// \param __A
-/// A 256-bit vector of [16 x bfloat].
-/// \returns A 512-bit vector of [16 x float] come from conversion of __A
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
- return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
- (__m512i)__S, (__mmask16)__U,
- (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
-}
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS512
-
-#endif
+++ /dev/null
-/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512bitalgintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512BITALGINTRIN_H
-#define __AVX512BITALGINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), __min_vector_width__(512)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_popcnt_epi16(__m512i __A)
-{
- return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B)
-{
- return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U,
- (__v32hi) _mm512_popcnt_epi16(__B),
- (__v32hi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B)
-{
- return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(),
- __U,
- __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_popcnt_epi8(__m512i __A)
-{
- return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B)
-{
- return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U,
- (__v64qi) _mm512_popcnt_epi8(__B),
- (__v64qi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B)
-{
- return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(),
- __U,
- __B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A,
- (__v64qi) __B,
- __U);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
-{
- return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1,
- __A,
- __B);
-}
-
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
+++ /dev/null
-/*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512BWINTRIN_H
-#define __AVX512BWINTRIN_H
-
-typedef unsigned int __mmask32;
-typedef unsigned long long __mmask64;
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw")))
-
-static __inline __mmask32 __DEFAULT_FN_ATTRS
-_knot_mask32(__mmask32 __M)
-{
- return __builtin_ia32_knotsi(__M);
-}
-
-static __inline __mmask64 __DEFAULT_FN_ATTRS
-_knot_mask64(__mmask64 __M)
-{
- return __builtin_ia32_knotdi(__M);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_kand_mask32(__mmask32 __A, __mmask32 __B)
-{
- return (__mmask32)__builtin_ia32_kandsi((__mmask32)__A, (__mmask32)__B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kand_mask64(__mmask64 __A, __mmask64 __B)
-{
- return (__mmask64)__builtin_ia32_kanddi((__mmask64)__A, (__mmask64)__B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_kandn_mask32(__mmask32 __A, __mmask32 __B)
-{
- return (__mmask32)__builtin_ia32_kandnsi((__mmask32)__A, (__mmask32)__B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kandn_mask64(__mmask64 __A, __mmask64 __B)
-{
- return (__mmask64)__builtin_ia32_kandndi((__mmask64)__A, (__mmask64)__B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_kor_mask32(__mmask32 __A, __mmask32 __B)
-{
- return (__mmask32)__builtin_ia32_korsi((__mmask32)__A, (__mmask32)__B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kor_mask64(__mmask64 __A, __mmask64 __B)
-{
- return (__mmask64)__builtin_ia32_kordi((__mmask64)__A, (__mmask64)__B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_kxnor_mask32(__mmask32 __A, __mmask32 __B)
-{
- return (__mmask32)__builtin_ia32_kxnorsi((__mmask32)__A, (__mmask32)__B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kxnor_mask64(__mmask64 __A, __mmask64 __B)
-{
- return (__mmask64)__builtin_ia32_kxnordi((__mmask64)__A, (__mmask64)__B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_kxor_mask32(__mmask32 __A, __mmask32 __B)
-{
- return (__mmask32)__builtin_ia32_kxorsi((__mmask32)__A, (__mmask32)__B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kxor_mask64(__mmask64 __A, __mmask64 __B)
-{
- return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestc_mask32_u8(__mmask32 __A, __mmask32 __B)
-{
- return (unsigned char)__builtin_ia32_kortestcsi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestz_mask32_u8(__mmask32 __A, __mmask32 __B)
-{
- return (unsigned char)__builtin_ia32_kortestzsi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
- *__C = (unsigned char)__builtin_ia32_kortestcsi(__A, __B);
- return (unsigned char)__builtin_ia32_kortestzsi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestc_mask64_u8(__mmask64 __A, __mmask64 __B)
-{
- return (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestz_mask64_u8(__mmask64 __A, __mmask64 __B)
-{
- return (unsigned char)__builtin_ia32_kortestzdi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) {
- *__C = (unsigned char)__builtin_ia32_kortestcdi(__A, __B);
- return (unsigned char)__builtin_ia32_kortestzdi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestc_mask32_u8(__mmask32 __A, __mmask32 __B)
-{
- return (unsigned char)__builtin_ia32_ktestcsi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestz_mask32_u8(__mmask32 __A, __mmask32 __B)
-{
- return (unsigned char)__builtin_ia32_ktestzsi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) {
- *__C = (unsigned char)__builtin_ia32_ktestcsi(__A, __B);
- return (unsigned char)__builtin_ia32_ktestzsi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestc_mask64_u8(__mmask64 __A, __mmask64 __B)
-{
- return (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestz_mask64_u8(__mmask64 __A, __mmask64 __B)
-{
- return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) {
- *__C = (unsigned char)__builtin_ia32_ktestcdi(__A, __B);
- return (unsigned char)__builtin_ia32_ktestzdi(__A, __B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_kadd_mask32(__mmask32 __A, __mmask32 __B)
-{
- return (__mmask32)__builtin_ia32_kaddsi((__mmask32)__A, (__mmask32)__B);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_kadd_mask64(__mmask64 __A, __mmask64 __B)
-{
- return (__mmask64)__builtin_ia32_kadddi((__mmask64)__A, (__mmask64)__B);
-}
-
-#define _kshiftli_mask32(A, I) \
- ((__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I)))
-
-#define _kshiftri_mask32(A, I) \
- ((__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I)))
-
-#define _kshiftli_mask64(A, I) \
- ((__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I)))
-
-#define _kshiftri_mask64(A, I) \
- ((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I)))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_cvtmask32_u32(__mmask32 __A) {
- return (unsigned int)__builtin_ia32_kmovd((__mmask32)__A);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_cvtmask64_u64(__mmask64 __A) {
- return (unsigned long long)__builtin_ia32_kmovq((__mmask64)__A);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_cvtu32_mask32(unsigned int __A) {
- return (__mmask32)__builtin_ia32_kmovd((__mmask32)__A);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_cvtu64_mask64(unsigned long long __A) {
- return (__mmask64)__builtin_ia32_kmovq((__mmask64)__A);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_load_mask32(__mmask32 *__A) {
- return (__mmask32)__builtin_ia32_kmovd(*(__mmask32 *)__A);
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_load_mask64(__mmask64 *__A) {
- return (__mmask64)__builtin_ia32_kmovq(*(__mmask64 *)__A);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_store_mask32(__mmask32 *__A, __mmask32 __B) {
- *(__mmask32 *)__A = __builtin_ia32_kmovd((__mmask32)__B);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_store_mask64(__mmask64 *__A, __mmask64 __B) {
- *(__mmask64 *)__A = __builtin_ia32_kmovq((__mmask64)__B);
-}
-
-/* Integer compare */
-
-#define _mm512_cmp_epi8_mask(a, b, p) \
- ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
- (__v64qi)(__m512i)(b), (int)(p), \
- (__mmask64)-1))
-
-#define _mm512_mask_cmp_epi8_mask(m, a, b, p) \
- ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \
- (__v64qi)(__m512i)(b), (int)(p), \
- (__mmask64)(m)))
-
-#define _mm512_cmp_epu8_mask(a, b, p) \
- ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
- (__v64qi)(__m512i)(b), (int)(p), \
- (__mmask64)-1))
-
-#define _mm512_mask_cmp_epu8_mask(m, a, b, p) \
- ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \
- (__v64qi)(__m512i)(b), (int)(p), \
- (__mmask64)(m)))
-
-#define _mm512_cmp_epi16_mask(a, b, p) \
- ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
- (__v32hi)(__m512i)(b), (int)(p), \
- (__mmask32)-1))
-
-#define _mm512_mask_cmp_epi16_mask(m, a, b, p) \
- ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \
- (__v32hi)(__m512i)(b), (int)(p), \
- (__mmask32)(m)))
-
-#define _mm512_cmp_epu16_mask(a, b, p) \
- ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
- (__v32hi)(__m512i)(b), (int)(p), \
- (__mmask32)-1))
-
-#define _mm512_mask_cmp_epu16_mask(m, a, b, p) \
- ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \
- (__v32hi)(__m512i)(b), (int)(p), \
- (__mmask32)(m)))
-
-#define _mm512_cmpeq_epi8_mask(A, B) \
- _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epi8_mask(k, A, B) \
- _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epi8_mask(A, B) \
- _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epi8_mask(k, A, B) \
- _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epi8_mask(A, B) \
- _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epi8_mask(k, A, B) \
- _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epi8_mask(A, B) \
- _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epi8_mask(k, A, B) \
- _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epi8_mask(A, B) \
- _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epi8_mask(k, A, B) \
- _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epi8_mask(A, B) \
- _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epi8_mask(k, A, B) \
- _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epu8_mask(A, B) \
- _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epu8_mask(k, A, B) \
- _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epu8_mask(A, B) \
- _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epu8_mask(k, A, B) \
- _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epu8_mask(A, B) \
- _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epu8_mask(k, A, B) \
- _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epu8_mask(A, B) \
- _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epu8_mask(k, A, B) \
- _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epu8_mask(A, B) \
- _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epu8_mask(k, A, B) \
- _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epu8_mask(A, B) \
- _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epu8_mask(k, A, B) \
- _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epi16_mask(A, B) \
- _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epi16_mask(k, A, B) \
- _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epi16_mask(A, B) \
- _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epi16_mask(k, A, B) \
- _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epi16_mask(A, B) \
- _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epi16_mask(k, A, B) \
- _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epi16_mask(A, B) \
- _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epi16_mask(k, A, B) \
- _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epi16_mask(A, B) \
- _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epi16_mask(k, A, B) \
- _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epi16_mask(A, B) \
- _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epi16_mask(k, A, B) \
- _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epu16_mask(A, B) \
- _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epu16_mask(k, A, B) \
- _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epu16_mask(A, B) \
- _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epu16_mask(k, A, B) \
- _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epu16_mask(A, B) \
- _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epu16_mask(k, A, B) \
- _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epu16_mask(A, B) \
- _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epu16_mask(k, A, B) \
- _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epu16_mask(A, B) \
- _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epu16_mask(k, A, B) \
- _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epu16_mask(A, B) \
- _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epu16_mask(k, A, B) \
- _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_add_epi8 (__m512i __A, __m512i __B) {
- return (__m512i) ((__v64qu) __A + (__v64qu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_add_epi8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_add_epi8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sub_epi8 (__m512i __A, __m512i __B) {
- return (__m512i) ((__v64qu) __A - (__v64qu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_sub_epi8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_sub_epi8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_add_epi16 (__m512i __A, __m512i __B) {
- return (__m512i) ((__v32hu) __A + (__v32hu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_add_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_add_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sub_epi16 (__m512i __A, __m512i __B) {
- return (__m512i) ((__v32hu) __A - (__v32hu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_sub_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_sub_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mullo_epi16 (__m512i __A, __m512i __B) {
- return (__m512i) ((__v32hu) __A * (__v32hu) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_mullo_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_mullo_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
-{
- return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
- (__v64qi) __W,
- (__v64qi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
-{
- return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
- (__v32hi) __W,
- (__v32hi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_abs_epi8 (__m512i __A)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pabsb512((__v64qi)__A);
-#else
- return (__m512i)__builtin_elementwise_abs((__v64qs)__A);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_abs_epi8(__A),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_abs_epi8(__A),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_abs_epi16 (__m512i __A)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pabsw512((__v32hi)__A);
-#else
- return (__m512i)__builtin_elementwise_abs((__v32hi)__A);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_abs_epi16(__A),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_abs_epi16(__A),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi32(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_packs_epi32(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_packs_epi32(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packs_epi16(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_packs_epi16(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_packs_epi16(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi32(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_packus_epi32(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_packus_epi32(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_packus_epi16(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_packus_epi16(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_packus_epi16(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_adds_epi8 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ > 14)
- return (__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B);
-#else
- return (__m512i)__builtin_ia32_paddsb512((__v64qi)__A, (__v64qi)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_adds_epi8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_adds_epi8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_adds_epi16 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ > 14)
- return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B);
-#else
- return (__m512i)__builtin_ia32_paddsw512((__v32hi)__A, (__v32hi)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_adds_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_adds_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_adds_epu8 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ > 14)
- return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B);
-#else
- return (__m512i)__builtin_ia32_paddusb512((__v64qi) __A, (__v64qi) __B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_adds_epu8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_adds_epu8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_adds_epu16 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ > 14)
- return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B);
-#else
- return (__m512i)__builtin_ia32_paddusw512((__v32hi) __A, (__v32hi) __B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_adds_epu16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_adds_epu16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_avg_epu8 (__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_pavgb512((__v64qi)__A, (__v64qi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_avg_epu8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_avg_epu8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_avg_epu16 (__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_pavgw512((__v32hi)__A, (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_avg_epu16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_avg_epu16(__A, __B),
- (__v32hi) _mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epi8 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pmaxsb512((__v64qi) __A, (__v64qi) __B);
-#else
- return (__m512i)__builtin_elementwise_max((__v64qs) __A, (__v64qs) __B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_max_epi8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_max_epi8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epi16 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pmaxsw512((__v32hi) __A, (__v32hi) __B);
-#else
- return (__m512i)__builtin_elementwise_max((__v32hi) __A, (__v32hi) __B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_max_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_max_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epu8 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pmaxub512((__v64qi)__A, (__v64qi)__B);
-#else
- return (__m512i)__builtin_elementwise_max((__v64qu)__A, (__v64qu)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_max_epu8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_max_epu8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epu16 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pmaxuw512((__v32hi)__A, (__v32hi)__B);
-#else
- return (__m512i)__builtin_elementwise_max((__v32hu)__A, (__v32hu)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_max_epu16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_max_epu16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epi8 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pminsb512((__v64qi) __A, (__v64qi) __B);
-#else
- return (__m512i)__builtin_elementwise_min((__v64qs) __A, (__v64qs) __B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_min_epi8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_min_epi8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epi16 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pminsw512((__v32hi) __A, (__v32hi) __B);
-#else
- return (__m512i)__builtin_elementwise_min((__v32hi) __A, (__v32hi) __B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_min_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_min_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epu8 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pminub512((__v64qi)__A, (__v64qi)__B);
-#else
- return (__m512i)__builtin_elementwise_min((__v64qu)__A, (__v64qu)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_min_epu8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_min_epu8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epu16 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pminuw512((__v32hi)__A, (__v32hi)__B);
-#else
- return (__m512i)__builtin_elementwise_min((__v32hu)__A, (__v32hu)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_min_epu16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_min_epu16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_shuffle_epi8(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_shuffle_epi8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_shuffle_epi8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_subs_epi8 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ > 14)
- return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B);
-#else
- return (__m512i)__builtin_ia32_psubsb512((__v64qi)__A, (__v64qi)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_subs_epi8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_subs_epi8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_subs_epi16 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ > 14)
- return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B);
-#else
- return (__m512i)__builtin_ia32_psubsw512((__v32hi)__A, (__v32hi)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_subs_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_subs_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_subs_epu8 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ > 14)
- return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B);
-#else
- return (__m512i)__builtin_ia32_psubusb512((__v64qi) __A, (__v64qi) __B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_subs_epu8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_subs_epu8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_subs_epu16 (__m512i __A, __m512i __B)
-{
-#if (__clang_major__ > 14)
- return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, (__v32hu) __B);
-#else
- return (__m512i)__builtin_ia32_psubusw512((__v32hi) __A, (__v32hi) __B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_subs_epu16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_subs_epu16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B)
-{
- return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
- (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512(__U,
- (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
- (__v32hi)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512(__U,
- (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
- (__v32hi)__I);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512(__U,
- (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mulhrs_epi16(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_pmulhrsw512((__v32hi)__A, (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_mulhrs_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_mulhrs_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mulhi_epi16(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_pmulhw512((__v32hi) __A, (__v32hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_mulhi_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_mulhi_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mulhi_epu16(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_pmulhuw512((__v32hi) __A, (__v32hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_mulhi_epu16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_mulhi_epu16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maddubs_epi16(__m512i __X, __m512i __Y) {
- return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X,
- __m512i __Y) {
- return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
- (__v32hi)_mm512_maddubs_epi16(__X, __Y),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) {
- return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U,
- (__v32hi)_mm512_maddubs_epi16(__X, __Y),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_madd_epi16(__m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_madd_epi16(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_madd_epi16(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtsepi16_epi8 (__m512i __A) {
- return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
- (__v32qi)_mm256_setzero_si256(),
- (__mmask32) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
- return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
- (__v32qi)__O,
- __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) {
- return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
- (__v32qi) _mm256_setzero_si256(),
- __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtusepi16_epi8 (__m512i __A) {
- return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
- (__v32qi) _mm256_setzero_si256(),
- (__mmask32) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
- return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
- (__v32qi) __O,
- __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) {
- return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
- (__v32qi) _mm256_setzero_si256(),
- __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi16_epi8 (__m512i __A) {
- return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
- (__v32qi) _mm256_undefined_si256(),
- (__mmask32) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) {
- return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
- (__v32qi) __O,
- __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) {
- return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
- (__v32qi) _mm256_setzero_si256(),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
-{
- __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
-{
- __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
-{
- __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_epi8(__m512i __A, __m512i __B) {
- return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
- 8, 64+8, 9, 64+9,
- 10, 64+10, 11, 64+11,
- 12, 64+12, 13, 64+13,
- 14, 64+14, 15, 64+15,
- 24, 64+24, 25, 64+25,
- 26, 64+26, 27, 64+27,
- 28, 64+28, 29, 64+29,
- 30, 64+30, 31, 64+31,
- 40, 64+40, 41, 64+41,
- 42, 64+42, 43, 64+43,
- 44, 64+44, 45, 64+45,
- 46, 64+46, 47, 64+47,
- 56, 64+56, 57, 64+57,
- 58, 64+58, 59, 64+59,
- 60, 64+60, 61, 64+61,
- 62, 64+62, 63, 64+63);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_unpackhi_epi8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_unpackhi_epi8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_epi16(__m512i __A, __m512i __B) {
- return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
- 4, 32+4, 5, 32+5,
- 6, 32+6, 7, 32+7,
- 12, 32+12, 13, 32+13,
- 14, 32+14, 15, 32+15,
- 20, 32+20, 21, 32+21,
- 22, 32+22, 23, 32+23,
- 28, 32+28, 29, 32+29,
- 30, 32+30, 31, 32+31);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_unpackhi_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_unpackhi_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_epi8(__m512i __A, __m512i __B) {
- return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B,
- 0, 64+0, 1, 64+1,
- 2, 64+2, 3, 64+3,
- 4, 64+4, 5, 64+5,
- 6, 64+6, 7, 64+7,
- 16, 64+16, 17, 64+17,
- 18, 64+18, 19, 64+19,
- 20, 64+20, 21, 64+21,
- 22, 64+22, 23, 64+23,
- 32, 64+32, 33, 64+33,
- 34, 64+34, 35, 64+35,
- 36, 64+36, 37, 64+37,
- 38, 64+38, 39, 64+39,
- 48, 64+48, 49, 64+49,
- 50, 64+50, 51, 64+51,
- 52, 64+52, 53, 64+53,
- 54, 64+54, 55, 64+55);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_unpacklo_epi8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
- (__v64qi)_mm512_unpacklo_epi8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_epi16(__m512i __A, __m512i __B) {
- return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B,
- 0, 32+0, 1, 32+1,
- 2, 32+2, 3, 32+3,
- 8, 32+8, 9, 32+9,
- 10, 32+10, 11, 32+11,
- 16, 32+16, 17, 32+17,
- 18, 32+18, 19, 32+19,
- 24, 32+24, 25, 32+25,
- 26, 32+26, 27, 32+27);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_unpacklo_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_unpacklo_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi8_epi16(__m256i __A)
-{
- /* This function always performs a signed extension, but __v32qi is a char
- which may be signed or unsigned, so use __v32qs. */
- return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_cvtepi8_epi16(__A),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_cvtepi8_epi16(__A),
- (__v32hi)_mm512_setzero_si512());
-}
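For context, a hypothetical illustration (editorial addition, not part of the deleted header): the cast through __v32qs in _mm512_cvtepi8_epi16 above is what guarantees sign extension even on targets where plain char is unsigned. The function name below is made up for the example.

#include <immintrin.h>

/* Illustrative only: a byte lane holding 0xFF widens to 0xFFFF (-1), not 0x00FF,
 * because the conversion is performed on the explicitly signed __v32qs type. */
static __m512i widen_signed_bytes(__m256i bytes)
{
  return _mm512_cvtepi8_epi16(bytes);
}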
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepu8_epi16(__m256i __A)
-{
- return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_cvtepu8_epi16(__A),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_cvtepu8_epi16(__A),
- (__v32hi)_mm512_setzero_si512());
-}
-
-#define _mm512_shufflehi_epi16(A, imm) \
- ((__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm)))
-
-#define _mm512_mask_shufflehi_epi16(W, U, A, imm) \
- ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
- (__v32hi)_mm512_shufflehi_epi16((A), \
- (imm)), \
- (__v32hi)(__m512i)(W)))
-
-#define _mm512_maskz_shufflehi_epi16(U, A, imm) \
- ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
- (__v32hi)_mm512_shufflehi_epi16((A), \
- (imm)), \
- (__v32hi)_mm512_setzero_si512()))
-
-#define _mm512_shufflelo_epi16(A, imm) \
- ((__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm)))
-
-#define _mm512_mask_shufflelo_epi16(W, U, A, imm) \
- ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
- (__v32hi)_mm512_shufflelo_epi16((A), \
- (imm)), \
- (__v32hi)(__m512i)(W)))
-
-#define _mm512_maskz_shufflelo_epi16(U, A, imm) \
- ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
- (__v32hi)_mm512_shufflelo_epi16((A), \
- (imm)), \
- (__v32hi)_mm512_setzero_si512()))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sllv_epi16(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_psllv32hi((__v32hi) __A, (__v32hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_sllv_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_sllv_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sll_epi16(__m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_sll_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_sll_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_slli_epi16(__m512i __A, unsigned int __B)
-{
-#if (__clang_major__ > 14)
- return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, (int)__B);
-#else
- return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
- unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_slli_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_slli_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-#define _mm512_bslli_epi128(a, imm) \
- ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srlv_epi16(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_psrlv32hi((__v32hi)__A, (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_srlv_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_srlv_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srav_epi16(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_psrav32hi((__v32hi)__A, (__v32hi)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_srav_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_srav_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sra_epi16(__m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_sra_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_sra_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srai_epi16(__m512i __A, unsigned int __B)
-{
-#if (__clang_major__ > 14)
- return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, (int)__B);
-#else
- return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A,
- unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_srai_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_srai_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srl_epi16(__m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_srl_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_srl_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srli_epi16(__m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A,
- unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_srli_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_srli_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-#define _mm512_bsrli_epi128(a, imm) \
- ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
- (__v32hi) __A,
- (__v32hi) __W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U,
- (__v32hi) __A,
- (__v32hi) _mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
- (__v64qi) __A,
- (__v64qi) __W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U,
- (__v64qi) __A,
- (__v64qi) _mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
-{
- return (__m512i) __builtin_ia32_selectb_512(__M,
- (__v64qi)_mm512_set1_epi8(__A),
- (__v64qi) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
-{
- return (__m512i) __builtin_ia32_selectb_512(__M,
- (__v64qi) _mm512_set1_epi8(__A),
- (__v64qi) _mm512_setzero_si512());
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS
-_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
-{
- return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
- (__mmask64) __B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS
-_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
-{
- return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
- (__mmask32) __B);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_loadu_epi16 (void const *__P)
-{
- struct __loadu_epi16 {
- __m512i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_epi16*)__P)->__v;
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_loaddquhi512_mask ((const __v32hi *) __P,
- (__v32hi) __W,
- (__mmask32) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_loaddquhi512_mask ((const __v32hi *) __P,
- (__v32hi)
- _mm512_setzero_si512 (),
- (__mmask32) __U);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_loadu_epi8 (void const *__P)
-{
- struct __loadu_epi8 {
- __m512i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_epi8*)__P)->__v;
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_loaddquqi512_mask ((const __v64qi *) __P,
- (__v64qi) __W,
- (__mmask64) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_loaddquqi512_mask ((const __v64qi *) __P,
- (__v64qi)
- _mm512_setzero_si512 (),
- (__mmask64) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_epi16 (void *__P, __m512i __A)
-{
- struct __storeu_epi16 {
- __m512i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_epi16*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A)
-{
- __builtin_ia32_storedquhi512_mask ((__v32hi *) __P,
- (__v32hi) __A,
- (__mmask32) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_epi8 (void *__P, __m512i __A)
-{
- struct __storeu_epi8 {
- __m512i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_epi8*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
-{
- __builtin_ia32_storedquqi512_mask ((__v64qi *) __P,
- (__v64qi) __A,
- (__mmask64) __U);
-}
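A minimal usage sketch (editorial addition, not part of the deleted header): the masked unaligned load/store intrinsics above are commonly used to handle a buffer tail shorter than 64 bytes without a scalar remainder loop. Assumes an AVX-512BW-capable target (e.g. -mavx512bw); the function and parameter names are illustrative.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Copies the first n (n <= 64) bytes of src to dst using a tail mask. */
static void copy_bytes_tail(uint8_t *dst, const uint8_t *src, size_t n)
{
  __mmask64 tail = (n >= 64) ? ~(__mmask64)0 : (((__mmask64)1 << n) - 1);
  __m512i v = _mm512_maskz_loadu_epi8(tail, src); /* unselected lanes read as 0 */
  _mm512_mask_storeu_epi8(dst, tail, v);          /* only the first n bytes are written */
}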
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
-_mm512_test_epi8_mask (__m512i __A, __m512i __B)
-{
- return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
-_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
-{
- return _mm512_mask_cmpneq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
-_mm512_test_epi16_mask (__m512i __A, __m512i __B)
-{
- return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
-_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
-{
- return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
-_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
-{
- return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_si512());
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
-_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
-{
- return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
-_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
-{
- return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
-_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
-{
- return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask64 __DEFAULT_FN_ATTRS512
-_mm512_movepi8_mask (__m512i __A)
-{
- return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS512
-_mm512_movepi16_mask (__m512i __A)
-{
- return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_movm_epi8 (__mmask64 __A)
-{
- return (__m512i) __builtin_ia32_cvtmask2b512 (__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_movm_epi16 (__mmask32 __A)
-{
- return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcastb_epi8 (__m128i __A)
-{
- return (__m512i)__builtin_shufflevector((__v16qi) __A, (__v16qi) __A,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectb_512(__M,
- (__v64qi) _mm512_broadcastb_epi8(__A),
- (__v64qi) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectb_512(__M,
- (__v64qi) _mm512_broadcastb_epi8(__A),
- (__v64qi) _mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
-{
- return (__m512i) __builtin_ia32_selectw_512(__M,
- (__v32hi) _mm512_set1_epi16(__A),
- (__v32hi) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
-{
- return (__m512i) __builtin_ia32_selectw_512(__M,
- (__v32hi) _mm512_set1_epi16(__A),
- (__v32hi) _mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcastw_epi16 (__m128i __A)
-{
- return (__m512i)__builtin_shufflevector((__v8hi) __A, (__v8hi) __A,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectw_512(__M,
- (__v32hi) _mm512_broadcastw_epi16(__A),
- (__v32hi) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectw_512(__M,
- (__v32hi) _mm512_broadcastw_epi16(__A),
- (__v32hi) _mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutexvar_epi16 (__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_permutexvar_epi16(__A, __B),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M,
- (__v32hi)_mm512_permutexvar_epi16(__A, __B),
- (__v32hi)__W);
-}
-
-#define _mm512_alignr_epi8(A, B, N) \
- ((__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \
- (__v64qi)(__m512i)(B), (int)(N)))
-
-#define _mm512_mask_alignr_epi8(W, U, A, B, N) \
- ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
- (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
- (__v64qi)(__m512i)(W)))
-
-#define _mm512_maskz_alignr_epi8(U, A, B, N) \
- ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
- (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \
- (__v64qi)(__m512i)_mm512_setzero_si512()))
-
-#define _mm512_dbsad_epu8(A, B, imm) \
- ((__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \
- (__v64qi)(__m512i)(B), (int)(imm)))
-
-#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
- (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
- (__v32hi)(__m512i)(W)))
-
-#define _mm512_maskz_dbsad_epu8(U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
- (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \
- (__v32hi)_mm512_setzero_si512()))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sad_epu8 (__m512i __A, __m512i __B)
-{
- return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A,
- (__v64qi) __B);
-}
-
-#undef __DEFAULT_FN_ATTRS512
-#undef __DEFAULT_FN_ATTRS
-
-#endif
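A minimal usage sketch of the byte/word intrinsics deleted above (editorial addition, not part of the patch): masked saturating addition over 64-byte blocks. Assumes an AVX-512BW-capable target; the buffer layout and names are illustrative.

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* dst[i] = saturate(a[i] + b[i]) for lanes selected by keep; other lanes keep dst. */
static void add_sat_masked(uint8_t *dst, const uint8_t *a, const uint8_t *b,
                           __mmask64 keep, size_t nblocks /* blocks of 64 bytes */)
{
  for (size_t i = 0; i < nblocks; ++i) {
    __m512i va  = _mm512_loadu_epi8(a + 64 * i);
    __m512i vb  = _mm512_loadu_epi8(b + 64 * i);
    __m512i old = _mm512_loadu_epi8(dst + 64 * i);
    __m512i r   = _mm512_mask_adds_epu8(old, keep, va, vb); /* unsigned saturation */
    _mm512_storeu_epi8(dst + 64 * i, r);
  }
}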
+++ /dev/null
-/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512CDINTRIN_H
-#define __AVX512CDINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), __min_vector_width__(512)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_conflict_epi64 (__m512i __A)
-{
- return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_conflict_epi64(__A),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_conflict_epi64(__A),
- (__v8di)_mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_conflict_epi32 (__m512i __A)
-{
- return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_conflict_epi32(__A),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_conflict_epi32(__A),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_lzcnt_epi32 (__m512i __A)
-{
- return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_lzcnt_epi32(__A),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_lzcnt_epi32(__A),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_lzcnt_epi64 (__m512i __A)
-{
- return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_lzcnt_epi64(__A),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_lzcnt_epi64(__A),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcastmb_epi64 (__mmask8 __A)
-{
- return (__m512i) _mm512_set1_epi64((long long) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_broadcastmw_epi32 (__mmask16 __A)
-{
- return (__m512i) _mm512_set1_epi32((int) __A);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
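A minimal usage sketch of the AVX512CD intrinsics deleted above (editorial addition, not part of the patch): per-lane leading-zero count, a common building block for vectorized log2 and bit-scan code. Assumes an AVX-512CD-capable target; the function name is illustrative.

#include <immintrin.h>
#include <stdint.h>

/* Returns the leading-zero count of each of 16 packed 32-bit lanes (32 for a zero lane). */
static __m512i lzcnt_16x32(const uint32_t *p)
{
  __m512i v = _mm512_loadu_si512((const void *)p);
  return _mm512_lzcnt_epi32(v);
}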
+++ /dev/null
-/*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512DQINTRIN_H
-#define __AVX512DQINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq")))
-
-static __inline __mmask8 __DEFAULT_FN_ATTRS
-_knot_mask8(__mmask8 __M)
-{
- return __builtin_ia32_knotqi(__M);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_kand_mask8(__mmask8 __A, __mmask8 __B)
-{
- return (__mmask8)__builtin_ia32_kandqi((__mmask8)__A, (__mmask8)__B);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_kandn_mask8(__mmask8 __A, __mmask8 __B)
-{
- return (__mmask8)__builtin_ia32_kandnqi((__mmask8)__A, (__mmask8)__B);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_kor_mask8(__mmask8 __A, __mmask8 __B)
-{
- return (__mmask8)__builtin_ia32_korqi((__mmask8)__A, (__mmask8)__B);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_kxnor_mask8(__mmask8 __A, __mmask8 __B)
-{
- return (__mmask8)__builtin_ia32_kxnorqi((__mmask8)__A, (__mmask8)__B);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_kxor_mask8(__mmask8 __A, __mmask8 __B)
-{
- return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestc_mask8_u8(__mmask8 __A, __mmask8 __B)
-{
- return (unsigned char)__builtin_ia32_kortestcqi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestz_mask8_u8(__mmask8 __A, __mmask8 __B)
-{
- return (unsigned char)__builtin_ia32_kortestzqi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) {
- *__C = (unsigned char)__builtin_ia32_kortestcqi(__A, __B);
- return (unsigned char)__builtin_ia32_kortestzqi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestc_mask8_u8(__mmask8 __A, __mmask8 __B)
-{
- return (unsigned char)__builtin_ia32_ktestcqi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestz_mask8_u8(__mmask8 __A, __mmask8 __B)
-{
- return (unsigned char)__builtin_ia32_ktestzqi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) {
- *__C = (unsigned char)__builtin_ia32_ktestcqi(__A, __B);
- return (unsigned char)__builtin_ia32_ktestzqi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestc_mask16_u8(__mmask16 __A, __mmask16 __B)
-{
- return (unsigned char)__builtin_ia32_ktestchi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktestz_mask16_u8(__mmask16 __A, __mmask16 __B)
-{
- return (unsigned char)__builtin_ia32_ktestzhi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
- *__C = (unsigned char)__builtin_ia32_ktestchi(__A, __B);
- return (unsigned char)__builtin_ia32_ktestzhi(__A, __B);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_kadd_mask8(__mmask8 __A, __mmask8 __B)
-{
- return (__mmask8)__builtin_ia32_kaddqi((__mmask8)__A, (__mmask8)__B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_kadd_mask16(__mmask16 __A, __mmask16 __B)
-{
- return (__mmask16)__builtin_ia32_kaddhi((__mmask16)__A, (__mmask16)__B);
-}
-
-#define _kshiftli_mask8(A, I) \
- ((__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I)))
-
-#define _kshiftri_mask8(A, I) \
- ((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I)))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_cvtmask8_u32(__mmask8 __A) {
- return (unsigned int)__builtin_ia32_kmovb((__mmask8)__A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_cvtu32_mask8(unsigned int __A) {
- return (__mmask8)__builtin_ia32_kmovb((__mmask8)__A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS
-_load_mask8(__mmask8 *__A) {
- return (__mmask8)__builtin_ia32_kmovb(*(__mmask8 *)__A);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_store_mask8(__mmask8 *__A, __mmask8 __B) {
- *(__mmask8 *)__A = __builtin_ia32_kmovb((__mmask8)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mullo_epi64 (__m512i __A, __m512i __B) {
- return (__m512i) ((__v8du) __A * (__v8du) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_mullo_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_mullo_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
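A brief usage sketch (editorial addition, not part of the deleted header): _mm512_mask_mullo_epi64 provides a 64-bit low multiply with per-lane write masking, something that otherwise needs a scalar fallback. Assumes an AVX-512DQ-capable target; the names are illustrative.

#include <immintrin.h>

/* Lanes selected by `which` become vals*factors; the rest keep their original value. */
static __m512i scale_selected(__m512i vals, __m512i factors, __mmask8 which)
{
  return _mm512_mask_mullo_epi64(vals, which, vals, factors);
}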
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_xor_pd(__m512d __A, __m512d __B) {
- return (__m512d)((__v8du)__A ^ (__v8du)__B);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_xor_pd(__A, __B),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_xor_pd(__A, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_xor_ps (__m512 __A, __m512 __B) {
- return (__m512)((__v16su)__A ^ (__v16su)__B);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_xor_ps(__A, __B),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_xor_ps(__A, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_or_pd(__m512d __A, __m512d __B) {
- return (__m512d)((__v8du)__A | (__v8du)__B);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_or_pd(__A, __B),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_or_pd(__A, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_or_ps(__m512 __A, __m512 __B) {
- return (__m512)((__v16su)__A | (__v16su)__B);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_or_ps(__A, __B),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_or_ps(__A, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_and_pd(__m512d __A, __m512d __B) {
- return (__m512d)((__v8du)__A & (__v8du)__B);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_and_pd(__A, __B),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_and_pd(__A, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_and_ps(__m512 __A, __m512 __B) {
- return (__m512)((__v16su)__A & (__v16su)__B);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_and_ps(__A, __B),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_and_ps(__A, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_andnot_pd(__m512d __A, __m512d __B) {
- return (__m512d)(~(__v8du)__A & (__v8du)__B);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_andnot_pd(__A, __B),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_andnot_pd(__A, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_andnot_ps(__m512 __A, __m512 __B) {
- return (__m512)(~(__v16su)__A & (__v16su)__B);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_andnot_ps(__A, __B),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_andnot_ps(__A, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtpd_epi64 (__m512d __A) {
- return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
- return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
- (__v8di) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) {
- return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundpd_epi64(A, R) \
- ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \
- ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \
- ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtpd_epu64 (__m512d __A) {
- return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
- return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
- (__v8di) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) {
- return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundpd_epu64(A, R) \
- ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \
- ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \
- ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtps_epi64 (__m256 __A) {
- return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
- return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
- (__v8di) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) {
- return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundps_epi64(A, R) \
- ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \
- ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundps_epi64(U, A, R) \
- ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtps_epu64 (__m256 __A) {
- return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
- return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
- (__v8di) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) {
- return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundps_epu64(A, R) \
- ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \
- ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundps_epu64(U, A, R) \
- ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtepi64_pd (__m512i __A) {
- return (__m512d)__builtin_convertvector((__v8di)__A, __v8df);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_cvtepi64_pd(__A),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_cvtepi64_pd(__A),
- (__v8df)_mm512_setzero_pd());
-}
-
-#define _mm512_cvt_roundepi64_pd(A, R) \
- ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \
- ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \
- ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_cvtepi64_ps (__m512i __A) {
- return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
- (__v8sf) _mm256_setzero_ps(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
- return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
- (__v8sf) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) {
- return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
- (__v8sf) _mm256_setzero_ps(),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepi64_ps(A, R) \
- ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \
- ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
- (__v8sf)(__m256)(W), (__mmask8)(U), \
- (int)(R)))
-
-#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \
- ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttpd_epi64 (__m512d __A) {
- return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) {
- return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
- (__v8di) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) {
- return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundpd_epi64(A, R) \
- ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \
- ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \
- ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttpd_epu64 (__m512d __A) {
- return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) {
- return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
- (__v8di) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) {
- return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundpd_epu64(A, R) \
- ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \
- ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \
- ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttps_epi64 (__m256 __A) {
- return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) {
- return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
- (__v8di) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) {
- return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundps_epi64(A, R) \
- ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \
- ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \
- ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttps_epu64 (__m256 __A) {
- return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) {
- return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
- (__v8di) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) {
- return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundps_epu64(A, R) \
- ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \
- ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
- (__v8di)(__m512i)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \
- ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \
- (__v8di)_mm512_setzero_si512(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtepu64_pd (__m512i __A) {
- return (__m512d)__builtin_convertvector((__v8du)__A, __v8df);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_cvtepu64_pd(__A),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_cvtepu64_pd(__A),
- (__v8df)_mm512_setzero_pd());
-}
-
-#define _mm512_cvt_roundepu64_pd(A, R) \
- ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \
- ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \
- ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_cvtepu64_ps (__m512i __A) {
- return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
- (__v8sf) _mm256_setzero_ps(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) {
- return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
- (__v8sf) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) {
- return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
- (__v8sf) _mm256_setzero_ps(),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepu64_ps(A, R) \
- ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \
- ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
- (__v8sf)(__m256)(W), (__mmask8)(U), \
- (int)(R)))
-
-#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \
- ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_range_pd(A, B, C) \
- ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_range_pd(W, U, A, B, C) \
- ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)(__m512d)(W), (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_range_pd(U, A, B, C) \
- ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_range_round_pd(A, B, C, R) \
- ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \
- ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)(__m512d)(W), (__mmask8)(U), \
- (int)(R)))
-
-#define _mm512_maskz_range_round_pd(U, A, B, C, R) \
- ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(C), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_range_ps(A, B, C) \
- ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_range_ps(W, U, A, B, C) \
- ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)(__m512)(W), (__mmask16)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_range_ps(U, A, B, C) \
- ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_range_round_ps(A, B, C, R) \
- ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \
- ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)(__m512)(W), (__mmask16)(U), \
- (int)(R)))
-
-#define _mm512_maskz_range_round_ps(U, A, B, C, R) \
- ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(C), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm_range_round_ss(A, B, C, R) \
- ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8) -1, (int)(C),\
- (int)(R)))
-
-#define _mm_range_ss(A, B, C) _mm_range_round_ss(A, B, C, _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_range_round_ss(W, U, A, B, C, R) \
- ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W),\
- (__mmask8)(U), (int)(C),\
- (int)(R)))
-
-#define _mm_mask_range_ss(W, U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C, _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_range_round_ss(U, A, B, C, R) \
- ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(C),\
- (int)(R)))
-
-#define _mm_maskz_range_ss(U, A, B, C) _mm_maskz_range_round_ss(U, A, B, C, _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_range_round_sd(A, B, C, R) \
- ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8) -1, (int)(C),\
- (int)(R)))
-
-#define _mm_range_sd(A, B, C) _mm_range_round_sd(A, B, C, _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_range_round_sd(W, U, A, B, C, R) \
- ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W),\
- (__mmask8)(U), (int)(C),\
- (int)(R)))
-
-#define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C, _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_range_round_sd(U, A, B, C, R) \
- ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(C),\
- (int)(R)))
-
-#define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C, _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_reduce_pd(A, B) \
- ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_reduce_pd(W, U, A, B) \
- ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_reduce_pd(U, A, B) \
- ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_reduce_ps(A, B) \
- ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_reduce_ps(W, U, A, B) \
- ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_reduce_ps(U, A, B) \
- ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_reduce_round_pd(A, B, R) \
- ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \
- ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_reduce_round_pd(U, A, B, R) \
- ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_reduce_round_ps(A, B, R) \
- ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_reduce_round_ps(W, U, A, B, R) \
- ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_reduce_round_ps(U, A, B, R) \
- ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm_reduce_ss(A, B, C) \
- ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
- (int)(C), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_reduce_ss(W, U, A, B, C) \
- ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
- (int)(C), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_reduce_ss(U, A, B, C) \
- ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(C), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_reduce_round_ss(A, B, C, R) \
- ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \
- (int)(C), (int)(R)))
-
-#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \
- ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
- (int)(C), (int)(R)))
-
-#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \
- ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(C), (int)(R)))
-
-#define _mm_reduce_sd(A, B, C) \
- ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(C), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_reduce_sd(W, U, A, B, C) \
- ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
- (int)(C), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_reduce_sd(U, A, B, C) \
- ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(C), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_reduce_round_sd(A, B, C, R) \
- ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(C), (int)(R)))
-
-#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \
- ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), (__mmask8)(U), \
- (int)(C), (int)(R)))
-
-#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \
- ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(C), (int)(R)))
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
-_mm512_movepi32_mask (__m512i __A)
-{
- return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_movm_epi32 (__mmask16 __A)
-{
- return (__m512i) __builtin_ia32_cvtmask2d512 (__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_movm_epi64 (__mmask8 __A)
-{
- return (__m512i) __builtin_ia32_cvtmask2q512 (__A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
-_mm512_movepi64_mask (__m512i __A)
-{
- return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A);
-}
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f32x2 (__m128 __A)
-{
- return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
- 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
- (__v16sf)_mm512_broadcast_f32x2(__A),
- (__v16sf)__O);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
- (__v16sf)_mm512_broadcast_f32x2(__A),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f32x8(__m256 __A)
-{
- return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A,
- 0, 1, 2, 3, 4, 5, 6, 7,
- 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
- (__v16sf)_mm512_broadcast_f32x8(__A),
- (__v16sf)__O);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
- (__v16sf)_mm512_broadcast_f32x8(__A),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f64x2(__m128d __A)
-{
- return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
- 0, 1, 0, 1, 0, 1, 0, 1);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
- (__v8df)_mm512_broadcast_f64x2(__A),
- (__v8df)__O);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
- (__v8df)_mm512_broadcast_f64x2(__A),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i32x2 (__m128i __A)
-{
- return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
- 0, 1, 0, 1, 0, 1, 0, 1,
- 0, 1, 0, 1, 0, 1, 0, 1);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_broadcast_i32x2(__A),
- (__v16si)__O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_broadcast_i32x2(__A),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i32x8(__m256i __A)
-{
- return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A,
- 0, 1, 2, 3, 4, 5, 6, 7,
- 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_broadcast_i32x8(__A),
- (__v16si)__O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_broadcast_i32x8(__A),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i64x2(__m128i __A)
-{
- return (__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
- 0, 1, 0, 1, 0, 1, 0, 1);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_broadcast_i64x2(__A),
- (__v8di)__O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_broadcast_i64x2(__A),
- (__v8di)_mm512_setzero_si512());
-}
-
-#define _mm512_extractf32x8_ps(A, imm) \
- ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v8sf)_mm256_undefined_ps(), \
- (__mmask8)-1))
-
-#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
- ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v8sf)(__m256)(W), \
- (__mmask8)(U)))
-
-#define _mm512_maskz_extractf32x8_ps(U, A, imm) \
- ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)(U)))
-
-#define _mm512_extractf64x2_pd(A, imm) \
- ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
- (int)(imm), \
- (__v2df)_mm_undefined_pd(), \
- (__mmask8)-1))
-
-#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
- ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
- (int)(imm), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U)))
-
-#define _mm512_maskz_extractf64x2_pd(U, A, imm) \
- ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
- (int)(imm), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U)))
-
-#define _mm512_extracti32x8_epi32(A, imm) \
- ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v8si)_mm256_undefined_si256(), \
- (__mmask8)-1))
-
-#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
- ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U)))
-
-#define _mm512_maskz_extracti32x8_epi32(U, A, imm) \
- ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U)))
-
-#define _mm512_extracti64x2_epi64(A, imm) \
- ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
- (int)(imm), \
- (__v2di)_mm_undefined_si128(), \
- (__mmask8)-1))
-
-#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
- ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
- (int)(imm), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)))
-
-#define _mm512_maskz_extracti64x2_epi64(U, A, imm) \
- ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
- (int)(imm), \
- (__v2di)_mm_setzero_si128(), \
- (__mmask8)(U)))
-
-#define _mm512_insertf32x8(A, B, imm) \
- ((__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \
- (__v8sf)(__m256)(B), (int)(imm)))
-
-#define _mm512_mask_insertf32x8(W, U, A, B, imm) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
- (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_insertf32x8(U, A, B, imm) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \
- (__v16sf)_mm512_setzero_ps()))
-
-#define _mm512_insertf64x2(A, B, imm) \
- ((__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \
- (__v2df)(__m128d)(B), (int)(imm)))
-
-#define _mm512_mask_insertf64x2(W, U, A, B, imm) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
- (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_insertf64x2(U, A, B, imm) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_insertf64x2((A), (B), (imm)), \
- (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_inserti32x8(A, B, imm) \
- ((__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \
- (__v8si)(__m256i)(B), (int)(imm)))
-
-#define _mm512_mask_inserti32x8(W, U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
- (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_inserti32x8(U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_inserti32x8((A), (B), (imm)), \
- (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_inserti64x2(A, B, imm) \
- ((__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \
- (__v2di)(__m128i)(B), (int)(imm)))
-
-#define _mm512_mask_inserti64x2(W, U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
- (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_inserti64x2(U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_inserti64x2((A), (B), (imm)), \
- (__v8di)_mm512_setzero_si512()))
-
-#define _mm512_mask_fpclass_ps_mask(U, A, imm) \
- ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
- (int)(imm), (__mmask16)(U)))
-
-#define _mm512_fpclass_ps_mask(A, imm) \
- ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \
- (int)(imm), (__mmask16)-1))
-
-#define _mm512_mask_fpclass_pd_mask(U, A, imm) \
- ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm512_fpclass_pd_mask(A, imm) \
- ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm_fpclass_sd_mask(A, imm) \
- ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm_mask_fpclass_sd_mask(U, A, imm) \
- ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm_fpclass_ss_mask(A, imm) \
- ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm_mask_fpclass_ss_mask(U, A, imm) \
- ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \
- (__mmask8)(U)))
-
-#undef __DEFAULT_FN_ATTRS512
-#undef __DEFAULT_FN_ATTRS
-
-#endif
+++ /dev/null
-/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512ERINTRIN_H
-#define __AVX512ERINTRIN_H
-
-/* exp2a23 */
-#define _mm512_exp2a23_round_pd(A, R) \
- ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \
- ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(S), (__mmask8)(M), \
- (int)(R)))
-
-#define _mm512_maskz_exp2a23_round_pd(M, A, R) \
- ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm512_exp2a23_pd(A) \
- _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_exp2a23_pd(S, M, A) \
- _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_exp2a23_pd(M, A) \
- _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_exp2a23_round_ps(A, R) \
- ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \
- ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(S), (__mmask16)(M), \
- (int)(R)))
-
-#define _mm512_maskz_exp2a23_round_ps(M, A, R) \
- ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(M), (int)(R)))
-
-#define _mm512_exp2a23_ps(A) \
- _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_exp2a23_ps(S, M, A) \
- _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_exp2a23_ps(M, A) \
- _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-/* rsqrt28 */
-#define _mm512_rsqrt28_round_pd(A, R) \
- ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \
- ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(S), (__mmask8)(M), \
- (int)(R)))
-
-#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \
- ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm512_rsqrt28_pd(A) \
- _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rsqrt28_pd(S, M, A) \
- _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rsqrt28_pd(M, A) \
- _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_rsqrt28_round_ps(A, R) \
- ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \
- ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(S), (__mmask16)(M), \
- (int)(R)))
-
-#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \
- ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(M), (int)(R)))
-
-#define _mm512_rsqrt28_ps(A) \
- _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rsqrt28_ps(S, M, A) \
- _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rsqrt28_ps(M, A) \
- _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rsqrt28_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \
- ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(S), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \
- ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_rsqrt28_ss(A, B) \
- _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rsqrt28_ss(S, M, A, B) \
- _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rsqrt28_ss(M, A, B) \
- _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rsqrt28_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \
- ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(S), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \
- ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_rsqrt28_sd(A, B) \
- _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rsqrt28_sd(S, M, A, B) \
- _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rsqrt28_sd(M, A, B) \
- _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-/* rcp28 */
-#define _mm512_rcp28_round_pd(A, R) \
- ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_rcp28_round_pd(S, M, A, R) \
- ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(S), (__mmask8)(M), \
- (int)(R)))
-
-#define _mm512_maskz_rcp28_round_pd(M, A, R) \
- ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm512_rcp28_pd(A) \
- _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rcp28_pd(S, M, A) \
- _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rcp28_pd(M, A) \
- _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_rcp28_round_ps(A, R) \
- ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_rcp28_round_ps(S, M, A, R) \
- ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(S), (__mmask16)(M), \
- (int)(R)))
-
-#define _mm512_maskz_rcp28_round_ps(M, A, R) \
- ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(M), (int)(R)))
-
-#define _mm512_rcp28_ps(A) \
- _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_rcp28_ps(S, M, A) \
- _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_maskz_rcp28_ps(M, A) \
- _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rcp28_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \
- ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(S), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rcp28_round_ss(M, A, B, R) \
- ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_rcp28_ss(A, B) \
- _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rcp28_ss(S, M, A, B) \
- _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rcp28_ss(M, A, B) \
- _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_rcp28_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \
- ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(S), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_maskz_rcp28_round_sd(M, A, B, R) \
- ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_rcp28_sd(A, B) \
- _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_mask_rcp28_sd(S, M, A, B) \
- _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_maskz_rcp28_sd(M, A, B) \
- _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)
-
-#endif /* __AVX512ERINTRIN_H */
+++ /dev/null
-/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512FINTRIN_H
-#define __AVX512FINTRIN_H
-
-typedef char __v64qi __attribute__((__vector_size__(64)));
-typedef short __v32hi __attribute__((__vector_size__(64)));
-typedef double __v8df __attribute__((__vector_size__(64)));
-typedef float __v16sf __attribute__((__vector_size__(64)));
-typedef long long __v8di __attribute__((__vector_size__(64)));
-typedef int __v16si __attribute__((__vector_size__(64)));
-
-/* Unsigned types */
-typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
-typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
-typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
-typedef unsigned int __v16su __attribute__((__vector_size__(64)));
-
-/* We need an explicitly signed variant for char. Note that this shouldn't
- * appear in the interface though. */
-typedef signed char __v64qs __attribute__((__vector_size__(64)));
-
-typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64)));
-typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64)));
-typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64)));
-
-typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1)));
-typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1)));
-typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1)));
-
-typedef unsigned char __mmask8;
-typedef unsigned short __mmask16;
-
-/* Rounding mode macros. */
-#define _MM_FROUND_TO_NEAREST_INT 0x00
-#define _MM_FROUND_TO_NEG_INF 0x01
-#define _MM_FROUND_TO_POS_INF 0x02
-#define _MM_FROUND_TO_ZERO 0x03
-#define _MM_FROUND_CUR_DIRECTION 0x04
-
-/* Constants for integer comparison predicates */
-typedef enum {
- _MM_CMPINT_EQ, /* Equal */
- _MM_CMPINT_LT, /* Less than */
- _MM_CMPINT_LE, /* Less than or Equal */
- _MM_CMPINT_UNUSED,
- _MM_CMPINT_NE, /* Not Equal */
- _MM_CMPINT_NLT, /* Not Less than */
-#define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */
- _MM_CMPINT_NLE /* Not Less than or Equal */
-#define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */
-} _MM_CMPINT_ENUM;
-
-typedef enum
-{
- _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
- _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
- _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
- _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
- _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
- _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
- _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
- _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
- _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
- _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
- _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
- _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
- _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
- _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
- _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
- _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
- _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
- _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
- _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
- _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
- _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
- _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
- _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
- _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
- _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
- _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
- _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
- _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
- _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
- _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
- _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
- _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
- _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
- _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
- _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
- _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
- _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
- _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
- _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
- _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
- _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
- _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
- _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
- _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
- _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
- _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
- _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
- _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
- _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
- _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
- _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
- _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
- _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
- _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
- _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
- _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
- _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
- _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
- _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
- _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
- _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
- _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
- _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
- _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
- _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
- _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
- _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
- _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
- _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
- _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
- _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
- _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
- _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
- _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
- _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
- _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
- _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
- _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
- _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
- _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
- _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
- _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
- _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
- _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
- _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
- _MM_PERM_DDDD = 0xFF
-} _MM_PERM_ENUM;
-
-typedef enum
-{
- _MM_MANT_NORM_1_2, /* interval [1, 2) */
- _MM_MANT_NORM_p5_2, /* interval [0.5, 2) */
- _MM_MANT_NORM_p5_1, /* interval [0.5, 1) */
- _MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */
-} _MM_MANTISSA_NORM_ENUM;
-
-typedef enum
-{
- _MM_MANT_SIGN_src, /* sign = sign(SRC) */
- _MM_MANT_SIGN_zero, /* sign = 0 */
- _MM_MANT_SIGN_nan /* DEST = NaN if sign(SRC) = 1 */
-} _MM_MANTISSA_SIGN_ENUM;
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))
-
-/* Create vectors with repeated elements */
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_setzero_si512(void)
-{
- return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
-}
-
-#define _mm512_setzero_epi32 _mm512_setzero_si512
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_undefined_pd(void)
-{
- return (__m512d)__builtin_ia32_undef512();
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_undefined(void)
-{
- return (__m512)__builtin_ia32_undef512();
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_undefined_ps(void)
-{
- return (__m512)__builtin_ia32_undef512();
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_undefined_epi32(void)
-{
- return (__m512i)__builtin_ia32_undef512();
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcastd_epi32 (__m128i __A)
-{
- return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512(__M,
- (__v16si) _mm512_broadcastd_epi32(__A),
- (__v16si) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512(__M,
- (__v16si) _mm512_broadcastd_epi32(__A),
- (__v16si) _mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcastq_epi64 (__m128i __A)
-{
- return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A,
- 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512(__M,
- (__v8di) _mm512_broadcastq_epi64(__A),
- (__v8di) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512(__M,
- (__v8di) _mm512_broadcastq_epi64(__A),
- (__v8di) _mm512_setzero_si512());
-}
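/*
 * Illustrative usage sketch (not part of the original header, helper name is
 * hypothetical): merge-masking keeps the corresponding element of the
 * pass-through operand where a mask bit is 0, while zero-masking writes 0
 * there. Compile with -mavx512f.
 */
#include <immintrin.h>

static void example_broadcast(int out_merge[16], int out_zero[16])
{
  __m128i src = _mm_set1_epi32(7);
  __m512i old = _mm512_set1_epi32(-1);
  __mmask16 k = 0x00FF;                                        /* low 8 lanes selected */
  __m512i merged = _mm512_mask_broadcastd_epi32(old, k, src);  /* lanes are 7 or -1 */
  __m512i zeroed = _mm512_maskz_broadcastd_epi32(k, src);      /* lanes are 7 or  0 */
  _mm512_storeu_si512((__m512i *)out_merge, merged);
  _mm512_storeu_si512((__m512i *)out_zero, zeroed);
}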
-
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_setzero_ps(void)
-{
- return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
-}
-
-#define _mm512_setzero _mm512_setzero_ps
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_setzero_pd(void)
-{
- return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_set1_ps(float __w)
-{
- return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w,
- __w, __w, __w, __w, __w, __w, __w, __w };
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_set1_pd(double __w)
-{
- return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set1_epi8(char __w)
-{
- return __extension__ (__m512i)(__v64qi){
- __w, __w, __w, __w, __w, __w, __w, __w,
- __w, __w, __w, __w, __w, __w, __w, __w,
- __w, __w, __w, __w, __w, __w, __w, __w,
- __w, __w, __w, __w, __w, __w, __w, __w,
- __w, __w, __w, __w, __w, __w, __w, __w,
- __w, __w, __w, __w, __w, __w, __w, __w,
- __w, __w, __w, __w, __w, __w, __w, __w,
- __w, __w, __w, __w, __w, __w, __w, __w };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set1_epi16(short __w)
-{
- return __extension__ (__m512i)(__v32hi){
- __w, __w, __w, __w, __w, __w, __w, __w,
- __w, __w, __w, __w, __w, __w, __w, __w,
- __w, __w, __w, __w, __w, __w, __w, __w,
- __w, __w, __w, __w, __w, __w, __w, __w };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set1_epi32(int __s)
-{
- return __extension__ (__m512i)(__v16si){
- __s, __s, __s, __s, __s, __s, __s, __s,
- __s, __s, __s, __s, __s, __s, __s, __s };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
-{
- return (__m512i)__builtin_ia32_selectd_512(__M,
- (__v16si)_mm512_set1_epi32(__A),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set1_epi64(long long __d)
-{
- return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
-{
- return (__m512i)__builtin_ia32_selectq_512(__M,
- (__v8di)_mm512_set1_epi64(__A),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_broadcastss_ps(__m128 __A)
-{
- return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A,
- 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
-{
- return __extension__ (__m512i)(__v16si)
- { __D, __C, __B, __A, __D, __C, __B, __A,
- __D, __C, __B, __A, __D, __C, __B, __A };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set4_epi64 (long long __A, long long __B, long long __C,
- long long __D)
-{
- return __extension__ (__m512i) (__v8di)
- { __D, __C, __B, __A, __D, __C, __B, __A };
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_set4_pd (double __A, double __B, double __C, double __D)
-{
- return __extension__ (__m512d)
- { __D, __C, __B, __A, __D, __C, __B, __A };
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_set4_ps (float __A, float __B, float __C, float __D)
-{
- return __extension__ (__m512)
- { __D, __C, __B, __A, __D, __C, __B, __A,
- __D, __C, __B, __A, __D, __C, __B, __A };
-}
-
-#define _mm512_setr4_epi32(e0,e1,e2,e3) \
- _mm512_set4_epi32((e3),(e2),(e1),(e0))
-
-#define _mm512_setr4_epi64(e0,e1,e2,e3) \
- _mm512_set4_epi64((e3),(e2),(e1),(e0))
-
-#define _mm512_setr4_pd(e0,e1,e2,e3) \
- _mm512_set4_pd((e3),(e2),(e1),(e0))
-
-#define _mm512_setr4_ps(e0,e1,e2,e3) \
- _mm512_set4_ps((e3),(e2),(e1),(e0))
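/*
 * Illustrative usage sketch (not part of the original header, helper name is
 * hypothetical): _mm512_set4_* repeats its four arguments with the last
 * argument in the lowest lane, while the _mm512_setr4_* macros above repeat
 * them in memory (lowest-lane-first) order. Compile with -mavx512f.
 */
#include <immintrin.h>

static void example_set4(int out[16])
{
  __m512i v = _mm512_setr4_epi32(1, 2, 3, 4);   /* lanes 0..3 = 1,2,3,4, repeated */
  _mm512_storeu_si512((__m512i *)out, v);       /* out[] = {1,2,3,4, 1,2,3,4, ...} */
}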
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_broadcastsd_pd(__m128d __A)
-{
- return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A,
- 0, 0, 0, 0, 0, 0, 0, 0);
-}
-
-/* Cast between vector types */
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_castpd256_pd512(__m256d __a)
-{
- return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_castps256_ps512(__m256 __a)
-{
- return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7,
- -1, -1, -1, -1, -1, -1, -1, -1);
-}
-
-static __inline __m128d __DEFAULT_FN_ATTRS512
-_mm512_castpd512_pd128(__m512d __a)
-{
- return __builtin_shufflevector(__a, __a, 0, 1);
-}
-
-static __inline __m256d __DEFAULT_FN_ATTRS512
-_mm512_castpd512_pd256 (__m512d __A)
-{
- return __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
-}
-
-static __inline __m128 __DEFAULT_FN_ATTRS512
-_mm512_castps512_ps128(__m512 __a)
-{
- return __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
-}
-
-static __inline __m256 __DEFAULT_FN_ATTRS512
-_mm512_castps512_ps256 (__m512 __A)
-{
- return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_castpd_ps (__m512d __A)
-{
- return (__m512) (__A);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_castpd_si512 (__m512d __A)
-{
- return (__m512i) (__A);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_castpd128_pd512 (__m128d __A)
-{
- return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_castps_pd (__m512 __A)
-{
- return (__m512d) (__A);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_castps_si512 (__m512 __A)
-{
- return (__m512i) (__A);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_castps128_ps512 (__m128 __A)
-{
- return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_castsi128_si512 (__m128i __A)
-{
- return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_castsi256_si512 (__m256i __A)
-{
- return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_castsi512_ps (__m512i __A)
-{
- return (__m512) (__A);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_castsi512_pd (__m512i __A)
-{
- return (__m512d) (__A);
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS512
-_mm512_castsi512_si128 (__m512i __A)
-{
- return (__m128i)__builtin_shufflevector(__A, __A , 0, 1);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS512
-_mm512_castsi512_si256 (__m512i __A)
-{
- return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_int2mask(int __a)
-{
- return (__mmask16)__a;
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm512_mask2int(__mmask16 __a)
-{
- return (int)__a;
-}
-
-/// Constructs a 512-bit floating-point vector of [8 x double] from a
-/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
-/// contain the value of the source vector. The upper 384 bits are set
-/// to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits
-/// contain the value of the parameter. The upper 384 bits are set to zero.
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_zextpd128_pd512(__m128d __a)
-{
- return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3);
-}
-
-/// Constructs a 512-bit floating-point vector of [8 x double] from a
-/// 256-bit floating-point vector of [4 x double]. The lower 256 bits
-/// contain the value of the source vector. The upper 256 bits are set
-/// to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double].
-/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits
-/// contain the value of the parameter. The upper 256 bits are set to zero.
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_zextpd256_pd512(__m256d __a)
-{
- return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-/// Constructs a 512-bit floating-point vector of [16 x float] from a
-/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
-/// the value of the source vector. The upper 384 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits
-/// contain the value of the parameter. The upper 384 bits are set to zero.
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_zextps128_ps512(__m128 __a)
-{
- return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7);
-}
-
-/// Constructs a 512-bit floating-point vector of [16 x float] from a
-/// 256-bit floating-point vector of [8 x float]. The lower 256 bits contain
-/// the value of the source vector. The upper 256 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float].
-/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits
-/// contain the value of the parameter. The upper 256 bits are set to zero.
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_zextps256_ps512(__m256 __a)
-{
- return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-/// Constructs a 512-bit integer vector from a 128-bit integer vector.
-/// The lower 128 bits contain the value of the source vector. The upper
-/// 384 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \returns A 512-bit integer vector. The lower 128 bits contain the value of
-/// the parameter. The upper 384 bits are set to zero.
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_zextsi128_si512(__m128i __a)
-{
- return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3);
-}
-
-/// Constructs a 512-bit integer vector from a 256-bit integer vector.
-/// The lower 256 bits contain the value of the source vector. The upper
-/// 256 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit integer vector.
-/// \returns A 512-bit integer vector. The lower 256 bits contain the value of
-/// the parameter. The upper 256 bits are set to zero.
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_zextsi256_si512(__m256i __a)
-{
- return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7);
-}
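/*
 * Illustrative usage sketch (not part of the original header, helper name is
 * hypothetical): widening casts such as _mm512_castsi256_si512 leave the
 * upper bits undefined (note the -1 shuffle indices above), whereas the
 * _mm512_zext* forms documented above guarantee the upper bits are zero.
 * Compile with -mavx512f.
 */
#include <immintrin.h>

static __m512i example_widen(__m256i lo)
{
  __m512i undef_hi = _mm512_castsi256_si512(lo); /* upper 256 bits undefined */
  __m512i zero_hi  = _mm512_zextsi256_si512(lo); /* upper 256 bits are zero  */
  (void)undef_hi;
  return zero_hi;
}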
-
-/* Bitwise operators */
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_and_epi32(__m512i __a, __m512i __b)
-{
- return (__m512i)((__v16su)__a & (__v16su)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
- (__v16si) _mm512_and_epi32(__a, __b),
- (__v16si) __src);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
-{
- return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (),
- __k, __a, __b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_and_epi64(__m512i __a, __m512i __b)
-{
- return (__m512i)((__v8du)__a & (__v8du)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
-{
- return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k,
- (__v8di) _mm512_and_epi64(__a, __b),
- (__v8di) __src);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
-{
- return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (),
- __k, __a, __b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_andnot_si512 (__m512i __A, __m512i __B)
-{
- return (__m512i)(~(__v8du)__A & (__v8du)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_andnot_epi32 (__m512i __A, __m512i __B)
-{
- return (__m512i)(~(__v16su)__A & (__v16su)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_andnot_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(),
- __U, __A, __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_andnot_epi64(__m512i __A, __m512i __B)
-{
- return (__m512i)(~(__v8du)__A & (__v8du)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_andnot_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(),
- __U, __A, __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_or_epi32(__m512i __a, __m512i __b)
-{
- return (__m512i)((__v16su)__a | (__v16su)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
- (__v16si)_mm512_or_epi32(__a, __b),
- (__v16si)__src);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
-{
- return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_or_epi64(__m512i __a, __m512i __b)
-{
- return (__m512i)((__v8du)__a | (__v8du)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
- (__v8di)_mm512_or_epi64(__a, __b),
- (__v8di)__src);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
-{
- return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_xor_epi32(__m512i __a, __m512i __b)
-{
- return (__m512i)((__v16su)__a ^ (__v16su)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
- (__v16si)_mm512_xor_epi32(__a, __b),
- (__v16si)__src);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
-{
- return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_xor_epi64(__m512i __a, __m512i __b)
-{
- return (__m512i)((__v8du)__a ^ (__v8du)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
- (__v8di)_mm512_xor_epi64(__a, __b),
- (__v8di)__src);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
-{
- return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_and_si512(__m512i __a, __m512i __b)
-{
- return (__m512i)((__v8du)__a & (__v8du)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_or_si512(__m512i __a, __m512i __b)
-{
- return (__m512i)((__v8du)__a | (__v8du)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_xor_si512(__m512i __a, __m512i __b)
-{
- return (__m512i)((__v8du)__a ^ (__v8du)__b);
-}
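/*
 * Illustrative usage sketch (not part of the original header, helper name is
 * hypothetical): the epi32/epi64 suffix only affects how the write-mask is
 * interpreted (16 dword lanes vs 8 qword lanes); the unmasked forms and the
 * *_si512 forms compute the same full-width bitwise result.
 * Compile with -mavx512f.
 */
#include <immintrin.h>

static __m512i example_masked_and(__m512i a, __m512i b, __m512i pass)
{
  /* AND dword lanes 0..7; keep lanes 8..15 from `pass`. */
  return _mm512_mask_and_epi32(pass, (__mmask16)0x00FF, a, b);
}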
-
-/* Arithmetic */
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_add_pd(__m512d __a, __m512d __b)
-{
- return (__m512d)((__v8df)__a + (__v8df)__b);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_add_ps(__m512 __a, __m512 __b)
-{
- return (__m512)((__v16sf)__a + (__v16sf)__b);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_mul_pd(__m512d __a, __m512d __b)
-{
- return (__m512d)((__v8df)__a * (__v8df)__b);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_mul_ps(__m512 __a, __m512 __b)
-{
- return (__m512)((__v16sf)__a * (__v16sf)__b);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_sub_pd(__m512d __a, __m512d __b)
-{
- return (__m512d)((__v8df)__a - (__v8df)__b);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_sub_ps(__m512 __a, __m512 __b)
-{
- return (__m512)((__v16sf)__a - (__v16sf)__b);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_add_epi64 (__m512i __A, __m512i __B)
-{
- return (__m512i) ((__v8du) __A + (__v8du) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_add_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_add_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sub_epi64 (__m512i __A, __m512i __B)
-{
- return (__m512i) ((__v8du) __A - (__v8du) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_sub_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_sub_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_add_epi32 (__m512i __A, __m512i __B)
-{
- return (__m512i) ((__v16su) __A + (__v16su) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_add_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_add_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sub_epi32 (__m512i __A, __m512i __B)
-{
- return (__m512i) ((__v16su) __A - (__v16su) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_sub_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_sub_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-#define _mm512_max_round_pd(A, B, R) \
- ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(R)))
-
-#define _mm512_mask_max_round_pd(W, U, A, B, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_max_round_pd((A), (B), (R)), \
- (__v8df)(W)))
-
-#define _mm512_maskz_max_round_pd(U, A, B, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_max_round_pd((A), (B), (R)), \
- (__v8df)_mm512_setzero_pd()))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_max_pd(__m512d __A, __m512d __B)
-{
- return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
-{
- return (__m512d)__builtin_ia32_selectpd_512(__U,
- (__v8df)_mm512_max_pd(__A, __B),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
-{
- return (__m512d)__builtin_ia32_selectpd_512(__U,
- (__v8df)_mm512_max_pd(__A, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-#define _mm512_max_round_ps(A, B, R) \
- ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(R)))
-
-#define _mm512_mask_max_round_ps(W, U, A, B, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
- (__v16sf)(W)))
-
-#define _mm512_maskz_max_round_ps(U, A, B, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_max_round_ps((A), (B), (R)), \
- (__v16sf)_mm512_setzero_ps()))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_max_ps(__m512 __A, __m512 __B)
-{
- return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
-{
- return (__m512)__builtin_ia32_selectps_512(__U,
- (__v16sf)_mm512_max_ps(__A, __B),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
-{
- return (__m512)__builtin_ia32_selectps_512(__U,
- (__v16sf)_mm512_max_ps(__A, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) _mm_setzero_ps (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_max_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_max_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
- (int)(R)))
-
-#define _mm_maskz_max_round_ss(U, A, B, R) \
- ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) _mm_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_max_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_max_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_max_round_sd(U, A, B, R) \
- ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline __m512i
-__DEFAULT_FN_ATTRS512
-_mm512_max_epi32(__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pmaxsd512((__v16si)__A, (__v16si)__B);
-#else
- return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_max_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_max_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epu32(__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pmaxud512((__v16si)__A, (__v16si)__B);
-#else
- return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_max_epu32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_max_epu32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epi64(__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pmaxsq512((__v8di)__A, (__v8di)__B);
-#else
- return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_max_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_max_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_max_epu64(__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pmaxuq512((__v8di)__A, (__v8di)__B);
-#else
- return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_max_epu64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_max_epu64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-#define _mm512_min_round_pd(A, B, R) \
- ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(R)))
-
-#define _mm512_mask_min_round_pd(W, U, A, B, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_min_round_pd((A), (B), (R)), \
- (__v8df)(W)))
-
-#define _mm512_maskz_min_round_pd(U, A, B, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_min_round_pd((A), (B), (R)), \
- (__v8df)_mm512_setzero_pd()))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_min_pd(__m512d __A, __m512d __B)
-{
- return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
-{
- return (__m512d)__builtin_ia32_selectpd_512(__U,
- (__v8df)_mm512_min_pd(__A, __B),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
-{
- return (__m512d)__builtin_ia32_selectpd_512(__U,
- (__v8df)_mm512_min_pd(__A, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-#define _mm512_min_round_ps(A, B, R) \
- ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(R)))
-
-#define _mm512_mask_min_round_ps(W, U, A, B, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
- (__v16sf)(W)))
-
-#define _mm512_maskz_min_round_ps(U, A, B, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_min_round_ps((A), (B), (R)), \
- (__v16sf)_mm512_setzero_ps()))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_min_ps(__m512 __A, __m512 __B)
-{
- return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
-{
- return (__m512)__builtin_ia32_selectps_512(__U,
- (__v16sf)_mm512_min_ps(__A, __B),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
-{
- return (__m512)__builtin_ia32_selectps_512(__U,
- (__v16sf)_mm512_min_ps(__A, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) _mm_setzero_ps (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_min_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_min_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
- (int)(R)))
-
-#define _mm_maskz_min_round_ss(U, A, B, R) \
- ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) _mm_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_min_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_min_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_min_round_sd(U, A, B, R) \
- ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline __m512i
-__DEFAULT_FN_ATTRS512
-_mm512_min_epi32(__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pminsd512((__v16si)__A, (__v16si)__B);
-#else
- return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_min_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_min_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epu32(__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pminud512((__v16si)__A, (__v16si)__B);
-#else
- return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_min_epu32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_min_epu32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epi64(__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pminsq512((__v8di)__A, (__v8di)__B);
-#else
- return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_min_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_min_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_min_epu64(__m512i __A, __m512i __B)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pminuq512((__v8di)__A, (__v8di)__B);
-#else
- return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_min_epu64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_min_epu64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mul_epi32(__m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_mul_epi32(__X, __Y),
- (__v8di)__W);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_mul_epi32(__X, __Y),
- (__v8di)_mm512_setzero_si512 ());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mul_epu32(__m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_mul_epu32(__X, __Y),
- (__v8di)__W);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_mul_epu32(__X, __Y),
- (__v8di)_mm512_setzero_si512 ());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mullo_epi32 (__m512i __A, __m512i __B)
-{
- return (__m512i) ((__v16su) __A * (__v16su) __B);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_mullo_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_mullo_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mullox_epi64 (__m512i __A, __m512i __B) {
- return (__m512i) ((__v8du) __A * (__v8du) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_mullox_epi64(__A, __B),
- (__v8di)__W);
-}
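/*
 * Illustrative usage sketch (not part of the original header, helper name is
 * hypothetical): _mm512_mul_epi32 multiplies only the even-indexed 32-bit
 * elements and widens each product to 64 bits, _mm512_mullo_epi32 keeps the
 * low 32 bits of every 32-bit product, and _mm512_mullox_epi64 keeps the low
 * 64 bits of a full 64-bit multiply. Compile with -mavx512f.
 */
#include <immintrin.h>

static void example_multiplies(__m512i a, __m512i b,
                               long long wide[8], int low[16])
{
  __m512i w = _mm512_mul_epi32(a, b);     /* 8 signed 64-bit products     */
  __m512i l = _mm512_mullo_epi32(a, b);   /* 16 truncated 32-bit products */
  _mm512_storeu_si512((__m512i *)wide, w);
  _mm512_storeu_si512((__m512i *)low, l);
}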
-
-#define _mm512_sqrt_round_pd(A, R) \
- ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R)))
-
-#define _mm512_mask_sqrt_round_pd(W, U, A, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_sqrt_round_pd((A), (R)), \
- (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_sqrt_round_pd(U, A, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_sqrt_round_pd((A), (R)), \
- (__v8df)_mm512_setzero_pd()))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_sqrt_pd(__m512d __A)
-{
- return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512(__U,
- (__v8df)_mm512_sqrt_pd(__A),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512(__U,
- (__v8df)_mm512_sqrt_pd(__A),
- (__v8df)_mm512_setzero_pd());
-}
-
-#define _mm512_sqrt_round_ps(A, R) \
- ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R)))
-
-#define _mm512_mask_sqrt_round_ps(W, U, A, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
- (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_sqrt_round_ps(U, A, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_sqrt_round_ps((A), (R)), \
- (__v16sf)_mm512_setzero_ps()))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_sqrt_ps(__m512 __A)
-{
- return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
-{
- return (__m512)__builtin_ia32_selectps_512(__U,
- (__v16sf)_mm512_sqrt_ps(__A),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
-{
- return (__m512)__builtin_ia32_selectps_512(__U,
- (__v16sf)_mm512_sqrt_ps(__A),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_rsqrt14_pd(__m512d __A)
-{
- return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) -1);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
- (__v8df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_rsqrt14_ps(__m512 __A)
-{
- return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) -1);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
- (__v16sf) __W,
- (__mmask16) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_rsqrt14_ss(__m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) _mm_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_rsqrt14_sd(__m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A,
- (__v2df) __B,
- (__v2df) _mm_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_rcp14_pd(__m512d __A)
-{
- return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) -1);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
- (__v8df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_rcp14_ps(__m512 __A)
-{
- return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) -1);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
- (__v16sf) __W,
- (__mmask16) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_rcp14_ss(__m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) _mm_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_rcp14_sd(__m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A,
- (__v2df) __B,
- (__v2df) _mm_setzero_pd (),
- (__mmask8) __U);
-}
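/*
 * Illustrative usage sketch (not part of the original header, helper name is
 * hypothetical): the rcp14/rsqrt14 intrinsics above return approximations
 * with a relative error of at most 2^-14, so they are a fast substitute for
 * division only where that accuracy is acceptable. Compile with -mavx512f.
 */
#include <immintrin.h>

static __m512 example_fast_divide(__m512 num, __m512 den)
{
  /* num / den, to roughly 14 bits of precision. */
  return _mm512_mul_ps(num, _mm512_rcp14_ps(den));
}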
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_floor_ps(__m512 __A)
-{
- return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
- _MM_FROUND_FLOOR,
- (__v16sf) __A, -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
- _MM_FROUND_FLOOR,
- (__v16sf) __W, __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_floor_pd(__m512d __A)
-{
- return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
- _MM_FROUND_FLOOR,
- (__v8df) __A, -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
- _MM_FROUND_FLOOR,
- (__v8df) __W, __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
- _MM_FROUND_CEIL,
- (__v16sf) __W, __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_ceil_ps(__m512 __A)
-{
- return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
- _MM_FROUND_CEIL,
- (__v16sf) __A, -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_ceil_pd(__m512d __A)
-{
- return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
- _MM_FROUND_CEIL,
- (__v8df) __A, -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
- _MM_FROUND_CEIL,
- (__v8df) __W, __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_abs_epi64(__m512i __A)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pabsq512((__v8di)__A);
-#else
- return (__m512i)__builtin_elementwise_abs((__v8di)__A);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_abs_epi64(__A),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_abs_epi64(__A),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_abs_epi32(__m512i __A)
-{
-#if (__clang_major__ < 14)
- return (__m512i)__builtin_ia32_pabsd512((__v16si) __A);
-#else
- return (__m512i)__builtin_elementwise_abs((__v16si) __A);
-#endif
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_abs_epi32(__A),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_abs_epi32(__A),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
- __A = _mm_add_ss(__A, __B);
- return __builtin_ia32_selectss_128(__U, __A, __W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) {
- __A = _mm_add_ss(__A, __B);
- return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
-}
-
-#define _mm_add_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_add_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
- (int)(R)))
-
-#define _mm_maskz_add_round_ss(U, A, B, R) \
- ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
- __A = _mm_add_sd(__A, __B);
- return __builtin_ia32_selectsd_128(__U, __A, __W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) {
- __A = _mm_add_sd(__A, __B);
- return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
-}
-#define _mm_add_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_add_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_add_round_sd(U, A, B, R) \
- ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_add_pd(__A, __B),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_add_pd(__A, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_add_ps(__A, __B),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_add_ps(__A, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-#define _mm512_add_round_pd(A, B, R) \
- ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(R)))
-
-#define _mm512_mask_add_round_pd(W, U, A, B, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_add_round_pd((A), (B), (R)), \
- (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_add_round_pd(U, A, B, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_add_round_pd((A), (B), (R)), \
- (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_add_round_ps(A, B, R) \
- ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(R)))
-
-#define _mm512_mask_add_round_ps(W, U, A, B, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
- (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_add_round_ps(U, A, B, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_add_round_ps((A), (B), (R)), \
- (__v16sf)_mm512_setzero_ps()))
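/*
 * Illustrative usage sketch (not part of the original header, helper name is
 * hypothetical): the *_round_* forms take a compile-time rounding-control
 * argument; an explicit rounding mode is combined with _MM_FROUND_NO_EXC to
 * get embedded rounding with exceptions suppressed. Compile with -mavx512f.
 */
#include <immintrin.h>

static __m512d example_add_round_down(__m512d a, __m512d b)
{
  /* Add with round-toward-negative-infinity, regardless of MXCSR. */
  return _mm512_add_round_pd(a, b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
}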
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
- __A = _mm_sub_ss(__A, __B);
- return __builtin_ia32_selectss_128(__U, __A, __W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) {
- __A = _mm_sub_ss(__A, __B);
- return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
-}
-#define _mm_sub_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_sub_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
- (int)(R)))
-
-#define _mm_maskz_sub_round_ss(U, A, B, R) \
- ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
- __A = _mm_sub_sd(__A, __B);
- return __builtin_ia32_selectsd_128(__U, __A, __W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) {
- __A = _mm_sub_sd(__A, __B);
- return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
-}
-
-#define _mm_sub_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_sub_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_sub_round_sd(U, A, B, R) \
- ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_sub_pd(__A, __B),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_sub_pd(__A, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_sub_ps(__A, __B),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_sub_ps(__A, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-#define _mm512_sub_round_pd(A, B, R) \
- ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(R)))
-
-#define _mm512_mask_sub_round_pd(W, U, A, B, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
- (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_sub_round_pd(U, A, B, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_sub_round_pd((A), (B), (R)), \
- (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_sub_round_ps(A, B, R) \
- ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(R)))
-
-#define _mm512_mask_sub_round_ps(W, U, A, B, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
- (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_sub_round_ps(U, A, B, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \
- (__v16sf)_mm512_setzero_ps()))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
- __A = _mm_mul_ss(__A, __B);
- return __builtin_ia32_selectss_128(__U, __A, __W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) {
- __A = _mm_mul_ss(__A, __B);
- return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
-}
-#define _mm_mul_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_mul_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
- (int)(R)))
-
-#define _mm_maskz_mul_round_ss(U, A, B, R) \
- ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
- __A = _mm_mul_sd(__A, __B);
- return __builtin_ia32_selectsd_128(__U, __A, __W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) {
- __A = _mm_mul_sd(__A, __B);
- return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
-}
-
-#define _mm_mul_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_mul_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_mul_round_sd(U, A, B, R) \
- ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_mul_pd(__A, __B),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_mul_pd(__A, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_mul_ps(__A, __B),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_mul_ps(__A, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-#define _mm512_mul_round_pd(A, B, R) \
- ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(R)))
-
-#define _mm512_mask_mul_round_pd(W, U, A, B, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
- (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_mul_round_pd(U, A, B, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_mul_round_pd((A), (B), (R)), \
- (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_mul_round_ps(A, B, R) \
- ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(R)))
-
-#define _mm512_mask_mul_round_ps(W, U, A, B, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
- (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_mul_round_ps(U, A, B, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \
- (__v16sf)_mm512_setzero_ps()))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) {
- __A = _mm_div_ss(__A, __B);
- return __builtin_ia32_selectss_128(__U, __A, __W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) {
- __A = _mm_div_ss(__A, __B);
- return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps());
-}
-
-#define _mm_div_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_div_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
- (int)(R)))
-
-#define _mm_maskz_div_round_ss(U, A, B, R) \
- ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) {
- __A = _mm_div_sd(__A, __B);
- return __builtin_ia32_selectsd_128(__U, __A, __W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) {
- __A = _mm_div_sd(__A, __B);
- return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd());
-}
-
-#define _mm_div_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_div_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_div_round_sd(U, A, B, R) \
- ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_div_pd(__m512d __a, __m512d __b)
-{
- return (__m512d)((__v8df)__a/(__v8df)__b);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_div_pd(__A, __B),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) {
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_div_pd(__A, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_div_ps(__m512 __a, __m512 __b)
-{
- return (__m512)((__v16sf)__a/(__v16sf)__b);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_div_ps(__A, __B),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) {
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_div_ps(__A, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-#define _mm512_div_round_pd(A, B, R) \
- ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(R)))
-
-#define _mm512_mask_div_round_pd(W, U, A, B, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_div_round_pd((A), (B), (R)), \
- (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_div_round_pd(U, A, B, R) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_div_round_pd((A), (B), (R)), \
- (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_div_round_ps(A, B, R) \
- ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(R)))
-
-#define _mm512_mask_div_round_ps(W, U, A, B, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
- (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_div_round_ps(U, A, B, R) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_div_round_ps((A), (B), (R)), \
- (__v16sf)_mm512_setzero_ps()))
-
-#define _mm512_roundscale_ps(A, B) \
- ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
- (__v16sf)_mm512_undefined_ps(), \
- (__mmask16)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_ps(A, B, C, imm) \
- ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
- (__v16sf)(__m512)(A), (__mmask16)(B), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_roundscale_ps(A, B, imm) \
- ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(A), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \
- ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \
- (__v16sf)(__m512)(A), (__mmask16)(B), \
- (int)(R)))
-
-#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \
- ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(A), (int)(R)))
-
-#define _mm512_roundscale_round_ps(A, imm, R) \
- ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v16sf)_mm512_undefined_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_roundscale_pd(A, B) \
- ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
- (__v8df)_mm512_undefined_pd(), \
- (__mmask8)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_pd(A, B, C, imm) \
- ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
- (__v8df)(__m512d)(A), (__mmask8)(B), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_roundscale_pd(A, B, imm) \
- ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(A), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \
- ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \
- (__v8df)(__m512d)(A), (__mmask8)(B), \
- (int)(R)))
-
-#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \
- ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(A), (int)(R)))
-
-#define _mm512_roundscale_round_pd(A, imm, R) \
- ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \
- (__v8df)_mm512_undefined_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_fmadd_round_pd(A, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)-1, (int)(R)))
-
-
-#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_fmsub_round_pd(A, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- -(__v8df)(__m512d)(C), \
- (__mmask8)-1, (int)(R)))
-
-
-#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- -(__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- -(__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_fnmadd_round_pd(A, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)-1, (int)(R)))
-
-
-#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_fnmsub_round_pd(A, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- -(__v8df)(__m512d)(C), \
- (__mmask8)-1, (int)(R)))
-
-
-#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- -(__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
- (__v8df) __B,
- -(__v8df) __C,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
- (__v8df) __B,
- -(__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
- (__v8df) __B,
- -(__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
- -(__v8df) __B,
- (__v8df) __C,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
- -(__v8df) __B,
- -(__v8df) __C,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A,
- (__v8df) __B,
- -(__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmadd_round_ps(A, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)-1, (int)(R)))
-
-
-#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
- ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_fmsub_round_ps(A, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- -(__v16sf)(__m512)(C), \
- (__mmask16)-1, (int)(R)))
-
-
-#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- -(__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- -(__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_fnmadd_round_ps(A, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
- -(__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)-1, (int)(R)))
-
-
-#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
- ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_fnmsub_round_ps(A, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
- -(__v16sf)(__m512)(B), \
- -(__v16sf)(__m512)(C), \
- (__mmask16)-1, (int)(R)))
-
-
-#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- -(__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
- return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- -(__v16sf) __C,
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- -(__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
- (__v16sf) __B,
- -(__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
- -(__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
- return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
- -(__v16sf) __B,
- -(__v16sf) __C,
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A,
- (__v16sf) __B,
- -(__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmaddsub_round_pd(A, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)-1, (int)(R)))
-
-
-#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
- ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_fmsubadd_round_pd(A, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- -(__v8df)(__m512d)(C), \
- (__mmask8)-1, (int)(R)))
-
-
-#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- -(__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- -(__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
- return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
- (__v8df) __B,
- -(__v8df) __C,
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
- (__v8df) __B,
- -(__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
- (__v8df) __B,
- -(__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmaddsub_round_ps(A, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)-1, (int)(R)))
-
-
-#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
- ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_fmsubadd_round_ps(A, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- -(__v16sf)(__m512)(C), \
- (__mmask16)-1, (int)(R)))
-
-
-#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- -(__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- -(__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
- return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- -(__v16sf) __C,
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- -(__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
- (__v16sf) __B,
- -(__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
- ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
- return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
- ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
- return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
- ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
- return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
- ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
- return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
- -(__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
- -(__v8df) __B,
- (__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
- -(__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
- -(__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
- ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \
- -(__v8df)(__m512d)(B), \
- -(__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
- ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(C), \
- (__mmask8)(U), (int)(R)))
-
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
-{
- return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
- -(__v8df) __B,
- -(__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
-{
- return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A,
- (__v8df) __B,
- (__v8df) __C,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
- ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \
- -(__v16sf)(__m512)(B), \
- -(__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
- ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(C), \
- (__mmask16)(U), (int)(R)))
-
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
-{
- return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
- -(__v16sf) __B,
- -(__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
-{
- return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __C,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-
-
-/* Vector permutations */
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
-{
- return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I,
- (__v16si) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
- (__v16si)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
- (__v16si)__I);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_permutex2var_epi32(__A, __I, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
-{
- return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I,
- (__v8di) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512(__U,
- (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
- (__v8di)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512(__U,
- (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
- (__v8di)__I);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512(__U,
- (__v8di)_mm512_permutex2var_epi64(__A, __I, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-#define _mm512_alignr_epi64(A, B, I) \
- ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
- (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_alignr_epi64(U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
- (__v8di)_mm512_setzero_si512()))
-
-#define _mm512_alignr_epi32(A, B, I) \
- ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
- (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_alignr_epi32(U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
- (__v16si)_mm512_setzero_si512()))
-/* Vector Extract */
-
-#define _mm512_extractf64x4_pd(A, I) \
- ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
- (__v4df)_mm256_undefined_pd(), \
- (__mmask8)-1))
-
-#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
- ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
- (__v4df)(__m256d)(W), \
- (__mmask8)(U)))
-
-#define _mm512_maskz_extractf64x4_pd(U, A, imm) \
- ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)(U)))
-
-#define _mm512_extractf32x4_ps(A, I) \
- ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
- (__v4sf)_mm_undefined_ps(), \
- (__mmask8)-1))
-
-#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
- ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U)))
-
-#define _mm512_maskz_extractf32x4_ps(U, A, imm) \
- ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U)))
-
-/* Vector Blend */
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
-{
- return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
- (__v8df) __W,
- (__v8df) __A);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
-{
- return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
- (__v16sf) __W,
- (__v16sf) __A);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
-{
- return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
- (__v8di) __W,
- (__v8di) __A);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
-{
- return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
- (__v16si) __W,
- (__v16si) __A);
-}
-
-/* Compare */
-
-#define _mm512_cmp_round_ps_mask(A, B, P, R) \
- ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(P), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \
- ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(P), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_cmp_ps_mask(A, B, P) \
- _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
- _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_cmpeq_ps_mask(A, B) \
- _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
-#define _mm512_mask_cmpeq_ps_mask(k, A, B) \
- _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ)
-
-#define _mm512_cmplt_ps_mask(A, B) \
- _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
-#define _mm512_mask_cmplt_ps_mask(k, A, B) \
- _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS)
-
-#define _mm512_cmple_ps_mask(A, B) \
- _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
-#define _mm512_mask_cmple_ps_mask(k, A, B) \
- _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS)
-
-#define _mm512_cmpunord_ps_mask(A, B) \
- _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
-#define _mm512_mask_cmpunord_ps_mask(k, A, B) \
- _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q)
-
-#define _mm512_cmpneq_ps_mask(A, B) \
- _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
-#define _mm512_mask_cmpneq_ps_mask(k, A, B) \
- _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ)
-
-#define _mm512_cmpnlt_ps_mask(A, B) \
- _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
-#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \
- _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US)
-
-#define _mm512_cmpnle_ps_mask(A, B) \
- _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
-#define _mm512_mask_cmpnle_ps_mask(k, A, B) \
- _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US)
-
-#define _mm512_cmpord_ps_mask(A, B) \
- _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
-#define _mm512_mask_cmpord_ps_mask(k, A, B) \
- _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q)
-
-#define _mm512_cmp_round_pd_mask(A, B, P, R) \
- ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(P), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \
- ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(P), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_cmp_pd_mask(A, B, P) \
- _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
- _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_cmpeq_pd_mask(A, B) \
- _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
-#define _mm512_mask_cmpeq_pd_mask(k, A, B) \
- _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ)
-
-#define _mm512_cmplt_pd_mask(A, B) \
- _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
-#define _mm512_mask_cmplt_pd_mask(k, A, B) \
- _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS)
-
-#define _mm512_cmple_pd_mask(A, B) \
- _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
-#define _mm512_mask_cmple_pd_mask(k, A, B) \
- _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS)
-
-#define _mm512_cmpunord_pd_mask(A, B) \
- _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
-#define _mm512_mask_cmpunord_pd_mask(k, A, B) \
- _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q)
-
-#define _mm512_cmpneq_pd_mask(A, B) \
- _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
-#define _mm512_mask_cmpneq_pd_mask(k, A, B) \
- _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ)
-
-#define _mm512_cmpnlt_pd_mask(A, B) \
- _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
-#define _mm512_mask_cmpnlt_pd_mask(k, A, B) \
- _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US)
-
-#define _mm512_cmpnle_pd_mask(A, B) \
- _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
-#define _mm512_mask_cmpnle_pd_mask(k, A, B) \
- _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US)
-
-#define _mm512_cmpord_pd_mask(A, B) \
- _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
-#define _mm512_mask_cmpord_pd_mask(k, A, B) \
- _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q)
-
-/* Conversion */
-
-#define _mm512_cvtt_roundps_epu32(A, R) \
- ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
- (__v16si)_mm512_undefined_epi32(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \
- ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \
- ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U), (int)(R)))
-
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttps_epu32(__m512 __A)
-{
- return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
-{
- return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
- (__v16si) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
-{
- return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
- (__v16si) _mm512_setzero_si512 (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepi32_ps(A, R) \
- ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \
- ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \
- ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_cvt_roundepu32_ps(A, R) \
- ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \
- ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \
- ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_cvtepu32_ps (__m512i __A)
-{
- return (__m512)__builtin_convertvector((__v16su)__A, __v16sf);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_cvtepu32_ps(__A),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_cvtepu32_ps(__A),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32_pd(__m256i __A)
-{
- return (__m512d)__builtin_convertvector((__v8si)__A, __v8df);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
- (__v8df)_mm512_cvtepi32_pd(__A),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
- (__v8df)_mm512_cvtepi32_pd(__A),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32lo_pd(__m512i __A)
-{
- return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A));
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
-{
- return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A));
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32_ps (__m512i __A)
-{
- return (__m512)__builtin_convertvector((__v16si)__A, __v16sf);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_cvtepi32_ps(__A),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_cvtepi32_ps(__A),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtepu32_pd(__m256i __A)
-{
- return (__m512d)__builtin_convertvector((__v8su)__A, __v8df);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
- (__v8df)_mm512_cvtepu32_pd(__A),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
- (__v8df)_mm512_cvtepu32_pd(__A),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtepu32lo_pd(__m512i __A)
-{
- return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A));
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
-{
- return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A));
-}
-
-#define _mm512_cvt_roundpd_ps(A, R) \
- ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \
- ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
- (__v8sf)(__m256)(W), (__mmask8)(U), \
- (int)(R)))
-
-#define _mm512_maskz_cvt_roundpd_ps(U, A, R) \
- ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_cvtpd_ps (__m512d __A)
-{
- return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
- (__v8sf) _mm256_undefined_ps (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
-{
- return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
- (__v8sf) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
-{
- return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
- (__v8sf) _mm256_setzero_ps (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_cvtpd_pslo (__m512d __A)
-{
- return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A),
- (__v8sf) _mm256_setzero_ps (),
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
-{
- return (__m512) __builtin_shufflevector (
- (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
- __U, __A),
- (__v8sf) _mm256_setzero_ps (),
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-#define _mm512_cvt_roundps_ph(A, I) \
- ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
- (__v16hi)_mm256_undefined_si256(), \
- (__mmask16)-1))
-
-#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \
- ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
- (__v16hi)(__m256i)(U), \
- (__mmask16)(W)))
-
-#define _mm512_maskz_cvt_roundps_ph(W, A, I) \
- ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \
- (__v16hi)_mm256_setzero_si256(), \
- (__mmask16)(W)))
-
-#define _mm512_cvtps_ph _mm512_cvt_roundps_ph
-#define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph
-#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph
-
-#define _mm512_cvt_roundph_ps(A, R) \
- ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
- (__v16sf)_mm512_undefined_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundph_ps(W, U, A, R) \
- ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_ps(U, A, R) \
- ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), (int)(R)))
-
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_cvtph_ps(__m256i __A)
-{
- return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
-{
- return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
-{
- return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
- (__v16sf) _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundpd_epi32(A, R) \
- ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \
- ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \
- ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvttpd_epi32(__m512d __a)
-{
- return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a,
- (__v8si)_mm256_setzero_si256(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
-{
- return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
- (__v8si) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
-{
- return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
- (__v8si) _mm256_setzero_si256 (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundps_epi32(A, R) \
- ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \
- ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \
- ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U), (int)(R)))
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttps_epi32(__m512 __a)
-{
- return (__m512i)
- __builtin_ia32_cvttps2dq512_mask((__v16sf) __a,
- (__v16si) _mm512_setzero_si512 (),
- (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
-{
- return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
- (__v16si) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
-{
- return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
- (__v16si) _mm512_setzero_si512 (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundps_epi32(A, R) \
- ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \
- ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \
- ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtps_epi32 (__m512 __A)
-{
- return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
- (__v16si) _mm512_undefined_epi32 (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
-{
- return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
- (__v16si) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
-{
- return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundpd_epi32(A, R) \
- ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \
- ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \
- ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtpd_epi32 (__m512d __A)
-{
- return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
- (__v8si)
- _mm256_undefined_si256 (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
-{
- return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
- (__v8si) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
-{
- return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundps_epu32(A, R) \
- ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \
- ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
- (__v16si)(__m512i)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \
- ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
- (__v16si)_mm512_setzero_si512(), \
- (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtps_epu32 (__m512 __A)
-{
- return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
- (__v16si) _mm512_undefined_epi32 (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
-{
- return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
- (__v16si) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A)
-{
- return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundpd_epu32(A, R) \
- ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \
- ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \
- ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtpd_epu32 (__m512d __A)
-{
- return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
- (__v8si)
- _mm256_undefined_si256 (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
-{
- return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
- (__v8si) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
-{
- return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
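The cvt and cvtt conversion families above differ only in rounding: the cvtt forms always truncate toward zero, while the cvt_round forms take an explicit rounding-mode operand. A minimal usage sketch follows (illustrative user code, not part of the header; it assumes a compiler targeting AVX-512F):

#include <immintrin.h>

/* Convert 16 floats to int32 two ways: truncation toward zero versus
   round-to-nearest with floating-point exceptions suppressed. */
static void cvt_demo(const float *src, int *trunc_out, int *nearest_out)
{
    __m512 v = _mm512_loadu_ps(src);
    __m512i t = _mm512_cvttps_epi32(v);
    __m512i n = _mm512_cvt_roundps_epi32(v, _MM_FROUND_TO_NEAREST_INT |
                                            _MM_FROUND_NO_EXC);
    _mm512_storeu_si512(trunc_out, t);
    _mm512_storeu_si512(nearest_out, n);
}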
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_cvtsd_f64(__m512d __a)
-{
- return __a[0];
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_cvtss_f32(__m512 __a)
-{
- return __a[0];
-}
-
-/* Unpack and Interleave */
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_pd(__m512d __a, __m512d __b)
-{
- return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
- 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
- (__v8df)_mm512_unpackhi_pd(__A, __B),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
- (__v8df)_mm512_unpackhi_pd(__A, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_pd(__m512d __a, __m512d __b)
-{
- return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b,
- 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
- (__v8df)_mm512_unpacklo_pd(__A, __B),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U,
- (__v8df)_mm512_unpacklo_pd(__A, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_ps(__m512 __a, __m512 __b)
-{
- return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
- 2, 18, 3, 19,
- 2+4, 18+4, 3+4, 19+4,
- 2+8, 18+8, 3+8, 19+8,
- 2+12, 18+12, 3+12, 19+12);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
- (__v16sf)_mm512_unpackhi_ps(__A, __B),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
- (__v16sf)_mm512_unpackhi_ps(__A, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_ps(__m512 __a, __m512 __b)
-{
- return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b,
- 0, 16, 1, 17,
- 0+4, 16+4, 1+4, 17+4,
- 0+8, 16+8, 1+8, 17+8,
- 0+12, 16+12, 1+12, 17+12);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
- (__v16sf)_mm512_unpacklo_ps(__A, __B),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16) __U,
- (__v16sf)_mm512_unpacklo_ps(__A, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
- 2, 18, 3, 19,
- 2+4, 18+4, 3+4, 19+4,
- 2+8, 18+8, 3+8, 19+8,
- 2+12, 18+12, 3+12, 19+12);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
- (__v16si)_mm512_unpackhi_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
- (__v16si)_mm512_unpackhi_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B,
- 0, 16, 1, 17,
- 0+4, 16+4, 1+4, 17+4,
- 0+8, 16+8, 1+8, 17+8,
- 0+12, 16+12, 1+12, 17+12);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
- (__v16si)_mm512_unpacklo_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U,
- (__v16si)_mm512_unpacklo_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
- 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
- (__v8di)_mm512_unpackhi_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
- (__v8di)_mm512_unpackhi_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B,
- 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
- (__v8di)_mm512_unpacklo_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U,
- (__v8di)_mm512_unpacklo_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
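The unpack intrinsics above interleave elements per 128-bit lane rather than across the whole 512-bit register, which is easy to misread from the shuffle indices alone. A short sketch of the resulting element order (illustrative, assumes AVX-512F):

#include <immintrin.h>

/* With a = {a0..a7} and b = {b0..b7} (doubles), the result is
   {a0,b0, a2,b2, a4,b4, a6,b6}: the interleave happens inside each
   128-bit lane, not across the full register. */
static __m512d interleave_low_pd(__m512d a, __m512d b)
{
    return _mm512_unpacklo_pd(a, b);
}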
-
-/* SIMD load ops */
-
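/* The unaligned load/store helpers in this section wrap the access in a
   one-element struct declared __packed__ and __may_alias__: packing drops
   the alignment requirement to 1, so an unaligned vector move is emitted,
   and may_alias exempts the access from strict-aliasing rules, which lets
   the helpers be used on pointers of any type and alignment. */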
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_loadu_si512 (void const *__P)
-{
- struct __loadu_si512 {
- __m512i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_si512*)__P)->__v;
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_loadu_epi32 (void const *__P)
-{
- struct __loadu_epi32 {
- __m512i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_epi32*)__P)->__v;
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
- (__v16si) __W,
- (__mmask16) __U);
-}
-
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_loadu_epi64 (void const *__P)
-{
- struct __loadu_epi64 {
- __m512i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_epi64*)__P)->__v;
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
- (__v8di) __W,
- (__mmask8) __U);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
-{
- return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
- (__v16sf) __W,
- (__mmask16) __U);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
-{
- return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
-{
- return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
- (__v8df) __W,
- (__mmask8) __U);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
-{
- return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_loadu_pd(void const *__p)
-{
- struct __loadu_pd {
- __m512d_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_pd*)__p)->__v;
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_loadu_ps(void const *__p)
-{
- struct __loadu_ps {
- __m512_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_ps*)__p)->__v;
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_load_ps(void const *__p)
-{
- return *(const __m512*)__p;
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
-{
- return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
- (__v16sf) __W,
- (__mmask16) __U);
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
-{
- return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_load_pd(void const *__p)
-{
- return *(const __m512d*)__p;
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
-{
- return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
- (__v8df) __W,
- (__mmask8) __U);
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
-{
- return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_load_si512 (void const *__P)
-{
- return *(const __m512i *) __P;
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_load_epi32 (void const *__P)
-{
- return *(const __m512i *) __P;
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_load_epi64 (void const *__P)
-{
- return *(const __m512i *) __P;
-}
-
-/* SIMD store ops */
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_epi64 (void *__P, __m512i __A)
-{
- struct __storeu_epi64 {
- __m512i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_epi64*)__P)->__v = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
-{
- __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A,
- (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_si512 (void *__P, __m512i __A)
-{
- struct __storeu_si512 {
- __m512i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_si512*)__P)->__v = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_epi32 (void *__P, __m512i __A)
-{
- struct __storeu_epi32 {
- __m512i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_epi32*)__P)->__v = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
-{
- __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A,
- (__mmask16) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
-{
- __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_pd(void *__P, __m512d __A)
-{
- struct __storeu_pd {
- __m512d_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_pd*)__P)->__v = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
-{
- __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A,
- (__mmask16) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_storeu_ps(void *__P, __m512 __A)
-{
- struct __storeu_ps {
- __m512_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_ps*)__P)->__v = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
-{
- __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_store_pd(void *__P, __m512d __A)
-{
- *(__m512d*)__P = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
-{
- __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A,
- (__mmask16) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_store_ps(void *__P, __m512 __A)
-{
- *(__m512*)__P = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_store_si512 (void *__P, __m512i __A)
-{
- *(__m512i *) __P = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_store_epi32 (void *__P, __m512i __A)
-{
- *(__m512i *) __P = __A;
-}
-
-static __inline void __DEFAULT_FN_ATTRS512
-_mm512_store_epi64 (void *__P, __m512i __A)
-{
- *(__m512i *) __P = __A;
-}
-
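The masked store forms above write only the lanes selected by the mask, and the masked loads read only those lanes, so a partial mask is the usual way to handle a tail shorter than a full vector. A hedged sketch (illustrative helper, assumes AVX-512F):

#include <immintrin.h>

/* Copy n <= 16 ints with a partial mask; unselected lanes are neither
   read nor written, so there is no out-of-bounds access. */
static void copy_int_tail(int *dst, const int *src, unsigned n)
{
    __mmask16 k = (__mmask16)((1u << n) - 1);     /* select the low n lanes */
    __m512i v = _mm512_maskz_loadu_epi32(k, src); /* unselected lanes read as 0 */
    _mm512_mask_storeu_epi32(dst, k, v);
}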
-/* Mask ops */
-
-static __inline __mmask16 __DEFAULT_FN_ATTRS
-_mm512_knot(__mmask16 __M)
-{
- return __builtin_ia32_knothi(__M);
-}
-
-/* Integer compare */
-
-#define _mm512_cmpeq_epi32_mask(A, B) \
- _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epi32_mask(k, A, B) \
- _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epi32_mask(A, B) \
- _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epi32_mask(k, A, B) \
- _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epi32_mask(A, B) \
- _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
- _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epi32_mask(A, B) \
- _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epi32_mask(k, A, B) \
- _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epi32_mask(A, B) \
- _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epi32_mask(k, A, B) \
- _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epi32_mask(A, B) \
- _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
- _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epu32_mask(A, B) \
- _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epu32_mask(k, A, B) \
- _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epu32_mask(A, B) \
- _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epu32_mask(k, A, B) \
- _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epu32_mask(A, B) \
- _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
- _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epu32_mask(A, B) \
- _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epu32_mask(k, A, B) \
- _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epu32_mask(A, B) \
- _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epu32_mask(k, A, B) \
- _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epu32_mask(A, B) \
- _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
- _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epi64_mask(A, B) \
- _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
- _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epi64_mask(A, B) \
- _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epi64_mask(k, A, B) \
- _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epi64_mask(A, B) \
- _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
- _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epi64_mask(A, B) \
- _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epi64_mask(k, A, B) \
- _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epi64_mask(A, B) \
- _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epi64_mask(k, A, B) \
- _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epi64_mask(A, B) \
- _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
- _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epu64_mask(A, B) \
- _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
- _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epu64_mask(A, B) \
- _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epu64_mask(k, A, B) \
- _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epu64_mask(A, B) \
- _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
- _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epu64_mask(A, B) \
- _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epu64_mask(k, A, B) \
- _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epu64_mask(A, B) \
- _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epu64_mask(k, A, B) \
- _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epu64_mask(A, B) \
- _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
- _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
-
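Each of the comparison helpers above yields a __mmask16 or __mmask8 bit-mask rather than a vector, and the mask is normally consumed directly by a masked operation, or combined with mask helpers such as _mm512_knot above. A small sketch (illustrative, assumes AVX-512F):

#include <immintrin.h>

/* Zero every negative lane of v: the compare produces a bit-mask and
   the zero-masking move applies it. */
static __m512i clamp_negative_lanes(__m512i v)
{
    __mmask16 nonneg = _mm512_cmpge_epi32_mask(v, _mm512_setzero_si512());
    return _mm512_maskz_mov_epi32(nonneg, v);
}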
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi8_epi32(__m128i __A)
-{
- /* This function always performs a signed extension, but __v16qi has plain
-    char elements, which may be signed or unsigned depending on the target,
-    so use the explicitly signed __v16qs type instead. */
- return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
-}
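As the comment notes, the widening goes through the explicitly signed __v16qs element type so that the conversion is a sign extension even when plain char is unsigned; the _mm512_cvtepu8_epi32 counterpart further down uses the unsigned __v16qu type instead. A small illustration of the difference (illustrative user code, assumes AVX-512F):

#include <immintrin.h>

/* A 0xFF byte widens to -1 with the signed conversion and to 255 with
   the unsigned one, regardless of the signedness of plain char. */
static void widen_byte_demo(void)
{
    __m128i bytes = _mm_set1_epi8((char)0xFF);
    __m512i s = _mm512_cvtepi8_epi32(bytes);   /* every lane holds -1  */
    __m512i u = _mm512_cvtepu8_epi32(bytes);   /* every lane holds 255 */
    (void)s;
    (void)u;
}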
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_cvtepi8_epi32(__A),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_cvtepi8_epi32(__A),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi8_epi64(__m128i __A)
-{
- /* This function always performs a signed extension, but __v16qi has plain
-    char elements, which may be signed or unsigned depending on the target,
-    so use the explicitly signed __v16qs type instead. */
- return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_cvtepi8_epi64(__A),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_cvtepi8_epi64(__A),
- (__v8di)_mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32_epi64(__m256i __X)
-{
- return (__m512i)__builtin_convertvector((__v8si)__X, __v8di);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_cvtepi32_epi64(__X),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_cvtepi32_epi64(__X),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi16_epi32(__m256i __A)
-{
- return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_cvtepi16_epi32(__A),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_cvtepi16_epi32(__A),
- (__v16si)_mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi16_epi64(__m128i __A)
-{
- return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_cvtepi16_epi64(__A),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_cvtepi16_epi64(__A),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepu8_epi32(__m128i __A)
-{
- return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_cvtepu8_epi32(__A),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_cvtepu8_epi32(__A),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepu8_epi64(__m128i __A)
-{
- return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_cvtepu8_epi64(__A),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_cvtepu8_epi64(__A),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepu32_epi64(__m256i __X)
-{
- return (__m512i)__builtin_convertvector((__v8su)__X, __v8di);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_cvtepu32_epi64(__X),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_cvtepu32_epi64(__X),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepu16_epi32(__m256i __A)
-{
- return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_cvtepu16_epi32(__A),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_cvtepu16_epi32(__A),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepu16_epi64(__m128i __A)
-{
- return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_cvtepu16_epi64(__A),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_cvtepu16_epi64(__A),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_rorv_epi32 (__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_rorv_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_rorv_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_rorv_epi64 (__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512(__U,
- (__v8di)_mm512_rorv_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512(__U,
- (__v8di)_mm512_rorv_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-
-
-#define _mm512_cmp_epi32_mask(a, b, p) \
- ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
- (__v16si)(__m512i)(b), (int)(p), \
- (__mmask16)-1))
-
-#define _mm512_cmp_epu32_mask(a, b, p) \
- ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
- (__v16si)(__m512i)(b), (int)(p), \
- (__mmask16)-1))
-
-#define _mm512_cmp_epi64_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
- (__v8di)(__m512i)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm512_cmp_epu64_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
- (__v8di)(__m512i)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm512_mask_cmp_epi32_mask(m, a, b, p) \
- ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
- (__v16si)(__m512i)(b), (int)(p), \
- (__mmask16)(m)))
-
-#define _mm512_mask_cmp_epu32_mask(m, a, b, p) \
- ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
- (__v16si)(__m512i)(b), (int)(p), \
- (__mmask16)(m)))
-
-#define _mm512_mask_cmp_epi64_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
- (__v8di)(__m512i)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm512_mask_cmp_epu64_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
- (__v8di)(__m512i)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm512_rol_epi32(a, b) \
- ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b)))
-
-#define _mm512_mask_rol_epi32(W, U, a, b) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_rol_epi32((a), (b)), \
- (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_rol_epi32(U, a, b) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_rol_epi32((a), (b)), \
- (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_rol_epi64(a, b) \
- ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b)))
-
-#define _mm512_mask_rol_epi64(W, U, a, b) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_rol_epi64((a), (b)), \
- (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_rol_epi64(U, a, b) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_rol_epi64((a), (b)), \
- (__v8di)_mm512_setzero_si512()))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_rolv_epi32 (__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_rolv_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_rolv_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_rolv_epi64 (__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512(__U,
- (__v8di)_mm512_rolv_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512(__U,
- (__v8di)_mm512_rolv_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-#define _mm512_ror_epi32(A, B) \
- ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B)))
-
-#define _mm512_mask_ror_epi32(W, U, A, B) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_ror_epi32((A), (B)), \
- (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_ror_epi32(U, A, B) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_ror_epi32((A), (B)), \
- (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_ror_epi64(A, B) \
- ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B)))
-
-#define _mm512_mask_ror_epi64(W, U, A, B) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_ror_epi64((A), (B)), \
- (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_ror_epi64(U, A, B) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_ror_epi64((A), (B)), \
- (__v8di)_mm512_setzero_si512()))
-
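The rol/ror macros above take an immediate rotate count, while the rolv/rorv functions take a per-lane count vector; for 32-bit lanes a left rotate by 32 - n is equivalent to a right rotate by n. A quick sketch (illustrative, assumes AVX-512F):

#include <immintrin.h>

/* Rotate every 32-bit lane right by 7, written both ways. */
static __m512i rotate_right_7(__m512i v)
{
    __m512i r = _mm512_ror_epi32(v, 7);
    __m512i l = _mm512_rol_epi32(v, 25);   /* 32 - 7, identical result */
    (void)l;
    return r;
}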
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_slli_epi32(__m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
- unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_slli_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_slli_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_slli_epi64(__m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_slli_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_slli_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srli_epi32(__m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A,
- unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_srli_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) {
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_srli_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srli_epi64(__m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A,
- unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_srli_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A,
- unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_srli_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
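Three shift-count forms appear in this header: slli/srli take an immediate count, sll/srl further down take the count from the low 64 bits of an __m128i, and sllv/srlv shift each lane by its own count. A sketch showing the same shift written with each form (illustrative, assumes AVX-512F):

#include <immintrin.h>

/* Logical left shift of every 32-bit lane by 3, three ways. */
static void shift_forms(__m512i v, __m512i out[3])
{
    out[0] = _mm512_slli_epi32(v, 3);                    /* immediate count  */
    out[1] = _mm512_sll_epi32(v, _mm_cvtsi32_si128(3));  /* count in an xmm  */
    out[2] = _mm512_sllv_epi32(v, _mm512_set1_epi32(3)); /* per-lane counts  */
}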
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
- (__v16si) __W,
- (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
-{
- __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
- (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
- (__v16si) __A,
- (__v16si) __W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U,
- (__v16si) __A,
- (__v16si) _mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
- (__v8di) __A,
- (__v8di) __W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U,
- (__v8di) __A,
- (__v8di) _mm512_setzero_si512 ());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
- (__v8di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
-{
- __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
- (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_movedup_pd (__m512d __A)
-{
- return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A,
- 0, 0, 2, 2, 4, 4, 6, 6);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_movedup_pd(__A),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_movedup_pd(__A),
- (__v8df)_mm512_setzero_pd());
-}
-
-#define _mm512_fixupimm_round_pd(A, B, C, imm, R) \
- ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8di)(__m512i)(C), (int)(imm), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \
- ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8di)(__m512i)(C), (int)(imm), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_fixupimm_pd(A, B, C, imm) \
- ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8di)(__m512i)(C), (int)(imm), \
- (__mmask8)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \
- ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8di)(__m512i)(C), (int)(imm), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \
- ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8di)(__m512i)(C), \
- (int)(imm), (__mmask8)(U), \
- (int)(R)))
-
-#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \
- ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8di)(__m512i)(C), \
- (int)(imm), (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \
- ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16si)(__m512i)(C), (int)(imm), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \
- ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16si)(__m512i)(C), (int)(imm), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_fixupimm_ps(A, B, C, imm) \
- ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16si)(__m512i)(C), (int)(imm), \
- (__mmask16)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \
- ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16si)(__m512i)(C), (int)(imm), \
- (__mmask16)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \
- ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16si)(__m512i)(C), \
- (int)(imm), (__mmask16)(U), \
- (int)(R)))
-
-#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \
- ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16si)(__m512i)(C), \
- (int)(imm), (__mmask16)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_fixupimm_round_sd(A, B, C, imm, R) \
- ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2di)(__m128i)(C), (int)(imm), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \
- ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2di)(__m128i)(C), (int)(imm), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_fixupimm_sd(A, B, C, imm) \
- ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2di)(__m128i)(C), (int)(imm), \
- (__mmask8)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \
- ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2di)(__m128i)(C), (int)(imm), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \
- ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2di)(__m128i)(C), (int)(imm), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \
- ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2di)(__m128i)(C), (int)(imm), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_fixupimm_round_ss(A, B, C, imm, R) \
- ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4si)(__m128i)(C), (int)(imm), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \
- ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4si)(__m128i)(C), (int)(imm), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_fixupimm_ss(A, B, C, imm) \
- ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4si)(__m128i)(C), (int)(imm), \
- (__mmask8)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \
- ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4si)(__m128i)(C), (int)(imm), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \
- ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4si)(__m128i)(C), (int)(imm), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \
- ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4si)(__m128i)(C), (int)(imm), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_getexp_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_getexp_sd (__m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A,
- (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_getexp_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A,
- (__v2df) __B,
- (__v2df) _mm_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_getexp_round_sd(U, A, B, R) \
- ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_getexp_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_getexp_ss (__m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
- (__v4sf) __B, (__v4sf) _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_getexp_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) _mm_setzero_ps (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_getexp_round_ss(U, A, B, R) \
- ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_getmant_round_sd(A, B, C, D, R) \
- ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(((D)<<2) | (C)), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_getmant_sd(A, B, C, D) \
- ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(((D)<<2) | (C)), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_sd(W, U, A, B, C, D) \
- ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(((D)<<2) | (C)), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \
- ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(((D)<<2) | (C)), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_getmant_sd(U, A, B, C, D) \
- ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(((D)<<2) | (C)), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \
- ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (int)(((D)<<2) | (C)), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_getmant_round_ss(A, B, C, D, R) \
- ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (int)(((D)<<2) | (C)), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_getmant_ss(A, B, C, D) \
- ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (int)(((D)<<2) | (C)), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_ss(W, U, A, B, C, D) \
- ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (int)(((D)<<2) | (C)), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \
- ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (int)(((D)<<2) | (C)), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_getmant_ss(U, A, B, C, D) \
- ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (int)(((D)<<2) | (C)), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \
- ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (int)(((D)<<2) | (C)), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kmov (__mmask16 __A)
-{
- return __A;
-}
-
-#define _mm_comi_round_sd(A, B, P, R) \
- ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
- (int)(P), (int)(R)))
-
-#define _mm_comi_round_ss(A, B, P, R) \
- ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
- (int)(P), (int)(R)))
-
-#ifdef __x86_64__
-#define _mm_cvt_roundsd_si64(A, R) \
- ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
-#endif
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sll_epi32(__m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_sll_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_sll_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sll_epi64(__m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_sll_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_sll_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sllv_epi32(__m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_sllv_epi32(__X, __Y),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_sllv_epi32(__X, __Y),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sllv_epi64(__m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_sllv_epi64(__X, __Y),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_sllv_epi64(__X, __Y),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sra_epi32(__m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_sra_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_sra_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_sra_epi64(__m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_sra_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_sra_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srav_epi32(__m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_srav_epi32(__X, __Y),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_srav_epi32(__X, __Y),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srav_epi64(__m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_srav_epi64(__X, __Y),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_srav_epi64(__X, __Y),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srl_epi32(__m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_srl_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_srl_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srl_epi64(__m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_srl_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_srl_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srlv_epi32(__m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_srlv_epi32(__X, __Y),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_srlv_epi32(__X, __Y),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_srlv_epi64(__X, __Y),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_srlv_epi64(__X, __Y),
- (__v8di)_mm512_setzero_si512());
-}
-
-#define _mm512_ternarylogic_epi32(A, B, C, imm) \
- ((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), \
- (__v16si)(__m512i)(C), (int)(imm), \
- (__mmask16)-1))
-
-#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \
- ((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), \
- (__v16si)(__m512i)(C), (int)(imm), \
- (__mmask16)(U)))
-
-#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \
- ((__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), \
- (__v16si)(__m512i)(C), \
- (int)(imm), (__mmask16)(U)))
-
-#define _mm512_ternarylogic_epi64(A, B, C, imm) \
- ((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), \
- (__v8di)(__m512i)(C), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \
- ((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), \
- (__v8di)(__m512i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \
- ((__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), \
- (__v8di)(__m512i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#ifdef __x86_64__
-#define _mm_cvt_roundsd_i64(A, R) \
- ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)))
-#endif
-
-#define _mm_cvt_roundsd_si32(A, R) \
- ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))
-
-#define _mm_cvt_roundsd_i32(A, R) \
- ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)))
-
-#define _mm_cvt_roundsd_u32(A, R) \
- ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)))
-
-static __inline__ unsigned __DEFAULT_FN_ATTRS128
-_mm_cvtsd_u32 (__m128d __A)
-{
- return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundsd_u64(A, R) \
- ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
- (int)(R)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
-_mm_cvtsd_u64 (__m128d __A)
-{
- return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
- __A,
- _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm_cvt_roundss_si32(A, R) \
- ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))
-
-#define _mm_cvt_roundss_i32(A, R) \
- ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)))
-
-#ifdef __x86_64__
-#define _mm_cvt_roundss_si64(A, R) \
- ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
-
-#define _mm_cvt_roundss_i64(A, R) \
- ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)))
-#endif
-
-#define _mm_cvt_roundss_u32(A, R) \
- ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)))
-
-static __inline__ unsigned __DEFAULT_FN_ATTRS128
-_mm_cvtss_u32 (__m128 __A)
-{
- return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundss_u64(A, R) \
- ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
- (int)(R)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
-_mm_cvtss_u64 (__m128 __A)
-{
- return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
- __A,
- _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm_cvtt_roundsd_i32(A, R) \
- ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))
-
-#define _mm_cvtt_roundsd_si32(A, R) \
- ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)))
-
-static __inline__ int __DEFAULT_FN_ATTRS128
-_mm_cvttsd_i32 (__m128d __A)
-{
- return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvtt_roundsd_si64(A, R) \
- ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))
-
-#define _mm_cvtt_roundsd_i64(A, R) \
- ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)))
-
-static __inline__ long long __DEFAULT_FN_ATTRS128
-_mm_cvttsd_i64 (__m128d __A)
-{
- return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
- _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm_cvtt_roundsd_u32(A, R) \
- ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)))
-
-static __inline__ unsigned __DEFAULT_FN_ATTRS128
-_mm_cvttsd_u32 (__m128d __A)
-{
- return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvtt_roundsd_u64(A, R) \
- ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
- (int)(R)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
-_mm_cvttsd_u64 (__m128d __A)
-{
- return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
- __A,
- _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm_cvtt_roundss_i32(A, R) \
- ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))
-
-#define _mm_cvtt_roundss_si32(A, R) \
- ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)))
-
-static __inline__ int __DEFAULT_FN_ATTRS128
-_mm_cvttss_i32 (__m128 __A)
-{
- return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvtt_roundss_i64(A, R) \
- ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))
-
-#define _mm_cvtt_roundss_si64(A, R) \
- ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)))
-
-static __inline__ long long __DEFAULT_FN_ATTRS128
-_mm_cvttss_i64 (__m128 __A)
-{
- return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
- _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm_cvtt_roundss_u32(A, R) \
- ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)))
-
-static __inline__ unsigned __DEFAULT_FN_ATTRS128
-_mm_cvttss_u32 (__m128 __A)
-{
- return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvtt_roundss_u64(A, R) \
- ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
- (int)(R)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
-_mm_cvttss_u64 (__m128 __A)
-{
- return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
- __A,
- _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm512_permute_pd(X, C) \
- ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C)))
-
-#define _mm512_mask_permute_pd(W, U, X, C) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_permute_pd((X), (C)), \
- (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_permute_pd(U, X, C) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_permute_pd((X), (C)), \
- (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_permute_ps(X, C) \
- ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C)))
-
-#define _mm512_mask_permute_ps(W, U, X, C) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_permute_ps((X), (C)), \
- (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_permute_ps(U, X, C) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_permute_ps((X), (C)), \
- (__v16sf)_mm512_setzero_ps()))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_permutevar_pd(__m512d __A, __m512i __C)
-{
- return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_permutevar_pd(__A, __C),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_permutevar_pd(__A, __C),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_permutevar_ps(__m512 __A, __m512i __C)
-{
- return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_permutevar_ps(__A, __C),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_permutevar_ps(__A, __C),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline __m512d __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
-{
- return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I,
- (__v8df)__B);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
-{
- return (__m512d)__builtin_ia32_selectpd_512(__U,
- (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
- (__v8df)__A);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U,
- __m512d __B)
-{
- return (__m512d)__builtin_ia32_selectpd_512(__U,
- (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
- (__v8df)(__m512d)__I);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I,
- __m512d __B)
-{
- return (__m512d)__builtin_ia32_selectpd_512(__U,
- (__v8df)_mm512_permutex2var_pd(__A, __I, __B),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline __m512 __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
-{
- return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I,
- (__v16sf) __B);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
-{
- return (__m512)__builtin_ia32_selectps_512(__U,
- (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
- (__v16sf)__A);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B)
-{
- return (__m512)__builtin_ia32_selectps_512(__U,
- (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
- (__v16sf)(__m512)__I);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B)
-{
- return (__m512)__builtin_ia32_selectps_512(__U,
- (__v16sf)_mm512_permutex2var_ps(__A, __I, __B),
- (__v16sf)_mm512_setzero_ps());
-}
-
-
-#define _mm512_cvtt_roundpd_epu32(A, R) \
- ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
- (__v8si)_mm256_undefined_si256(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \
- ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
- (__v8si)(__m256i)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \
- ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
- (__v8si)_mm256_setzero_si256(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvttpd_epu32 (__m512d __A)
-{
- return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
- (__v8si)
- _mm256_undefined_si256 (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
-{
- return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
- (__v8si) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
-{
- return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_roundscale_round_sd(A, B, imm, R) \
- ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(imm), \
- (int)(R)))
-
-#define _mm_roundscale_sd(A, B, imm) \
- ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(imm), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_roundscale_sd(W, U, A, B, imm) \
- ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(imm), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \
- ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(I), \
- (int)(R)))
-
-#define _mm_maskz_roundscale_sd(U, A, B, I) \
- ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(I), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
- ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(I), \
- (int)(R)))
-
-#define _mm_roundscale_round_ss(A, B, imm, R) \
- ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(imm), \
- (int)(R)))
-
-#define _mm_roundscale_ss(A, B, imm) \
- ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(imm), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_roundscale_ss(W, U, A, B, I) \
- ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U), (int)(I), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \
- ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U), (int)(I), \
- (int)(R)))
-
-#define _mm_maskz_roundscale_ss(U, A, B, I) \
- ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(I), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
- ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(I), \
- (int)(R)))
-
-#define _mm512_scalef_round_pd(A, B, R) \
- ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)_mm512_undefined_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \
- ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_scalef_round_pd(U, A, B, R) \
- ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_scalef_pd (__m512d __A, __m512d __B)
-{
- return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_undefined_pd (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
-{
- return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
-{
- return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
- (__v8df) __B,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_scalef_round_ps(A, B, R) \
- ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)_mm512_undefined_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \
- ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_scalef_round_ps(U, A, B, R) \
- ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_scalef_ps (__m512 __A, __m512 __B)
-{
- return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_undefined_ps (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
-{
- return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
-{
- return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
- (__v16sf) __B,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_scalef_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_scalef_sd (__m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
- (__v2df)( __B), (__v2df) _mm_setzero_pd(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_scalef_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A,
- (__v2df) __B,
- (__v2df) _mm_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_scalef_round_sd(U, A, B, R) \
- ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_scalef_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_scalef_ss (__m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
- (__v4sf)( __B), (__v4sf) _mm_setzero_ps(),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_scalef_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) _mm_setzero_ps (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_scalef_round_ss(U, A, B, R) \
- ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srai_epi32(__m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A,
- unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_srai_epi32(__A, __B),
- (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A,
- unsigned int __B) {
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
- (__v16si)_mm512_srai_epi32(__A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_srai_epi64(__m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_srai_epi64(__A, __B),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
- (__v8di)_mm512_srai_epi64(__A, __B),
- (__v8di)_mm512_setzero_si512());
-}
-
-#define _mm512_shuffle_f32x4(A, B, imm) \
- ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(imm)))
-
-#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
- (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \
- (__v16sf)_mm512_setzero_ps()))
-
-#define _mm512_shuffle_f64x2(A, B, imm) \
- ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(imm)))
-
-#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
- (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \
- (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_shuffle_i32x4(A, B, imm) \
- ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(imm)))
-
-#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
- (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \
- (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_shuffle_i64x2(A, B, imm) \
- ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(imm)))
-
-#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
- (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \
- (__v8di)_mm512_setzero_si512()))
-
-#define _mm512_shuffle_pd(A, B, M) \
- ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(B), (int)(M)))
-
-#define _mm512_mask_shuffle_pd(W, U, A, B, M) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
- (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_shuffle_pd(U, A, B, M) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_shuffle_pd((A), (B), (M)), \
- (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_shuffle_ps(A, B, M) \
- ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(B), (int)(M)))
-
-#define _mm512_mask_shuffle_ps(W, U, A, B, M) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
- (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_shuffle_ps(U, A, B, M) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
- (__v16sf)_mm512_setzero_ps()))
-
-#define _mm_sqrt_round_sd(A, B, R) \
- ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A,
- (__v2df) __B,
- (__v2df) _mm_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_sqrt_round_sd(U, A, B, R) \
- ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_sqrt_round_ss(A, B, R) \
- ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_sqrt_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(W), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) _mm_setzero_ps (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_sqrt_round_ss(U, A, B, R) \
- ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f32x4(__m128 __A)
-{
- return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
- 0, 1, 2, 3, 0, 1, 2, 3,
- 0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
- (__v16sf)_mm512_broadcast_f32x4(__A),
- (__v16sf)__O);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__M,
- (__v16sf)_mm512_broadcast_f32x4(__A),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_broadcast_f64x4(__m256d __A)
-{
- return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A,
- 0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
- (__v8df)_mm512_broadcast_f64x4(__A),
- (__v8df)__O);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M,
- (__v8df)_mm512_broadcast_f64x4(__A),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i32x4(__m128i __A)
-{
- return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
- 0, 1, 2, 3, 0, 1, 2, 3,
- 0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_broadcast_i32x4(__A),
- (__v16si)__O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_broadcast_i32x4(__A),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_broadcast_i64x4(__m256i __A)
-{
- return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A,
- 0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_broadcast_i64x4(__A),
- (__v8di)__O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_broadcast_i64x4(__A),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512(__M,
- (__v8df) _mm512_broadcastsd_pd(__A),
- (__v8df) __O);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512(__M,
- (__v8df) _mm512_broadcastsd_pd(__A),
- (__v8df) _mm512_setzero_pd());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
-{
- return (__m512)__builtin_ia32_selectps_512(__M,
- (__v16sf) _mm512_broadcastss_ps(__A),
- (__v16sf) __O);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
-{
- return (__m512)__builtin_ia32_selectps_512(__M,
- (__v16sf) _mm512_broadcastss_ps(__A),
- (__v16sf) _mm512_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtsepi32_epi8 (__m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
- (__v16qi) _mm_undefined_si128 (),
- (__mmask16) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
- (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
-{
- __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtsepi32_epi16 (__m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
- (__v16hi) _mm256_undefined_si256 (),
- (__mmask16) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
- (__v16hi) __O, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
- (__v16hi) _mm256_setzero_si256 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
-{
- __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtsepi64_epi8 (__m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
- (__v16qi) _mm_undefined_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
- (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
-{
- __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtsepi64_epi32 (__m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
- (__v8si) _mm256_undefined_si256 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
- (__v8si) __O, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
- (__v8si) _mm256_setzero_si256 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
-{
- __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtsepi64_epi16 (__m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
- (__v8hi) _mm_undefined_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
-{
- __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtusepi32_epi8 (__m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
- (__v16qi) _mm_undefined_si128 (),
- (__mmask16) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
- (__v16qi) __O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
-{
- __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtusepi32_epi16 (__m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
- (__v16hi) _mm256_undefined_si256 (),
- (__mmask16) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
- (__v16hi) __O,
- __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
- (__v16hi) _mm256_setzero_si256 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
-{
- __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtusepi64_epi8 (__m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
- (__v16qi) _mm_undefined_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
- (__v16qi) __O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
-{
- __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtusepi64_epi32 (__m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
- (__v8si) _mm256_undefined_si256 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
- (__v8si) __O, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
- (__v8si) _mm256_setzero_si256 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
-{
- __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtusepi64_epi16 (__m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
- (__v8hi) _mm_undefined_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
-{
- __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32_epi8 (__m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
- (__v16qi) _mm_undefined_si128 (),
- (__mmask16) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
- (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
-{
- __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32_epi16 (__m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
- (__v16hi) _mm256_undefined_si256 (),
- (__mmask16) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
- (__v16hi) __O, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
- (__v16hi) _mm256_setzero_si256 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
-{
- __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi64_epi8 (__m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
- (__v16qi) _mm_undefined_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
- (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
-{
- __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi64_epi32 (__m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
- (__v8si) _mm256_undefined_si256 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
- (__v8si) __O, __M);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
-{
- return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
- (__v8si) _mm256_setzero_si256 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
-{
- __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi64_epi16 (__m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
- (__v8hi) _mm_undefined_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
-{
- return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
-{
- __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
-}
-
-#define _mm512_extracti32x4_epi32(A, imm) \
- ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)_mm_undefined_si128(), \
- (__mmask8)-1))
-
-#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
- ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)))
-
-#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \
- ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)))
-
-#define _mm512_extracti64x4_epi64(A, imm) \
- ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)_mm256_undefined_si256(), \
- (__mmask8)-1))
-
-#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
- ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)(__m256i)(W), \
- (__mmask8)(U)))
-
-#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \
- ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
- (__v4di)_mm256_setzero_si256(), \
- (__mmask8)(U)))
-
-#define _mm512_insertf64x4(A, B, imm) \
- ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \
- (__v4df)(__m256d)(B), (int)(imm)))
-
-#define _mm512_mask_insertf64x4(W, U, A, B, imm) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
- (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_insertf64x4(U, A, B, imm) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_insertf64x4((A), (B), (imm)), \
- (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_inserti64x4(A, B, imm) \
- ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \
- (__v4di)(__m256i)(B), (int)(imm)))
-
-#define _mm512_mask_inserti64x4(W, U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
- (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_inserti64x4(U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_inserti64x4((A), (B), (imm)), \
- (__v8di)_mm512_setzero_si512()))
-
-#define _mm512_insertf32x4(A, B, imm) \
- ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \
- (__v4sf)(__m128)(B), (int)(imm)))
-
-#define _mm512_mask_insertf32x4(W, U, A, B, imm) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
- (__v16sf)(__m512)(W)))
-
-#define _mm512_maskz_insertf32x4(U, A, B, imm) \
- ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
- (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
- (__v16sf)_mm512_setzero_ps()))
-
-#define _mm512_inserti32x4(A, B, imm) \
- ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \
- (__v4si)(__m128i)(B), (int)(imm)))
-
-#define _mm512_mask_inserti32x4(W, U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
- (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_inserti32x4(U, A, B, imm) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_inserti32x4((A), (B), (imm)), \
- (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_getmant_round_pd(A, B, C, R) \
- ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
- (int)(((C)<<2) | (B)), \
- (__v8df)_mm512_undefined_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \
- ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
- (int)(((C)<<2) | (B)), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \
- ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
- (int)(((C)<<2) | (B)), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_getmant_pd(A, B, C) \
- ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
- (int)(((C)<<2) | (B)), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_getmant_pd(W, U, A, B, C) \
- ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
- (int)(((C)<<2) | (B)), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_getmant_pd(U, A, B, C) \
- ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
- (int)(((C)<<2) | (B)), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_getmant_round_ps(A, B, C, R) \
- ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
- (int)(((C)<<2) | (B)), \
- (__v16sf)_mm512_undefined_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \
- ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
- (int)(((C)<<2) | (B)), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \
- ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
- (int)(((C)<<2) | (B)), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_getmant_ps(A, B, C) \
- ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
- (int)(((C)<<2)|(B)), \
- (__v16sf)_mm512_undefined_ps(), \
- (__mmask16)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_getmant_ps(W, U, A, B, C) \
- ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
- (int)(((C)<<2)|(B)), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_getmant_ps(U, A, B, C) \
- ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
- (int)(((C)<<2)|(B)), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_getexp_round_pd(A, R) \
- ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_undefined_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_getexp_round_pd(W, U, A, R) \
- ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_getexp_round_pd(U, A, R) \
- ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_getexp_pd (__m512d __A)
-{
- return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
- (__v8df) _mm512_undefined_pd (),
- (__mmask8) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
- (__v8df) __W,
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
- (__v8df) _mm512_setzero_pd (),
- (__mmask8) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_getexp_round_ps(A, R) \
- ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_undefined_ps(), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_getexp_round_ps(W, U, A, R) \
- ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)(__m512)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_getexp_round_ps(U, A, R) \
- ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
- (__v16sf)_mm512_setzero_ps(), \
- (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_getexp_ps (__m512 __A)
-{
- return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
- (__v16sf) _mm512_undefined_ps (),
- (__mmask16) -1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
- (__v16sf) __W,
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
- (__v16sf) _mm512_setzero_ps (),
- (__mmask16) __U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_i64gather_ps(index, addr, scale) \
- ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \
- (void const *)(addr), \
- (__v8di)(__m512i)(index), (__mmask8)-1, \
- (int)(scale)))
-
-#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \
- ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\
- (void const *)(addr), \
- (__v8di)(__m512i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm512_i64gather_epi32(index, addr, scale) \
- ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \
- (void const *)(addr), \
- (__v8di)(__m512i)(index), \
- (__mmask8)-1, (int)(scale)))
-
-#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \
- ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \
- (void const *)(addr), \
- (__v8di)(__m512i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm512_i64gather_pd(index, addr, scale) \
- ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \
- (void const *)(addr), \
- (__v8di)(__m512i)(index), (__mmask8)-1, \
- (int)(scale)))
-
-#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \
- ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \
- (void const *)(addr), \
- (__v8di)(__m512i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm512_i64gather_epi64(index, addr, scale) \
- ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \
- (void const *)(addr), \
- (__v8di)(__m512i)(index), (__mmask8)-1, \
- (int)(scale)))
-
-#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \
- ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
- (void const *)(addr), \
- (__v8di)(__m512i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm512_i32gather_ps(index, addr, scale) \
- ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
- (void const *)(addr), \
- (__v16si)(__m512)(index), \
- (__mmask16)-1, (int)(scale)))
-
-#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \
- ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
- (void const *)(addr), \
- (__v16si)(__m512)(index), \
- (__mmask16)(mask), (int)(scale)))
-
-#define _mm512_i32gather_epi32(index, addr, scale) \
- ((__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
- (void const *)(addr), \
- (__v16si)(__m512i)(index), \
- (__mmask16)-1, (int)(scale)))
-
-#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \
- ((__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
- (void const *)(addr), \
- (__v16si)(__m512i)(index), \
- (__mmask16)(mask), (int)(scale)))
-
-#define _mm512_i32gather_pd(index, addr, scale) \
- ((__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
- (void const *)(addr), \
- (__v8si)(__m256i)(index), (__mmask8)-1, \
- (int)(scale)))
-
-#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \
- ((__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
- (void const *)(addr), \
- (__v8si)(__m256i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm512_i32gather_epi64(index, addr, scale) \
- ((__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
- (void const *)(addr), \
- (__v8si)(__m256i)(index), (__mmask8)-1, \
- (int)(scale)))
-
-#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \
- ((__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
- (void const *)(addr), \
- (__v8si)(__m256i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm512_i64scatter_ps(addr, index, v1, scale) \
- __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \
- (__v8di)(__m512i)(index), \
- (__v8sf)(__m256)(v1), (int)(scale))
-
-#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \
- __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \
- (__v8di)(__m512i)(index), \
- (__v8sf)(__m256)(v1), (int)(scale))
-
-#define _mm512_i64scatter_epi32(addr, index, v1, scale) \
- __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \
- (__v8di)(__m512i)(index), \
- (__v8si)(__m256i)(v1), (int)(scale))
-
-#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
- __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \
- (__v8di)(__m512i)(index), \
- (__v8si)(__m256i)(v1), (int)(scale))
-
-#define _mm512_i64scatter_pd(addr, index, v1, scale) \
- __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \
- (__v8di)(__m512i)(index), \
- (__v8df)(__m512d)(v1), (int)(scale))
-
-#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \
- __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \
- (__v8di)(__m512i)(index), \
- (__v8df)(__m512d)(v1), (int)(scale))
-
-#define _mm512_i64scatter_epi64(addr, index, v1, scale) \
- __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \
- (__v8di)(__m512i)(index), \
- (__v8di)(__m512i)(v1), (int)(scale))
-
-#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
- __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \
- (__v8di)(__m512i)(index), \
- (__v8di)(__m512i)(v1), (int)(scale))
-
-#define _mm512_i32scatter_ps(addr, index, v1, scale) \
- __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \
- (__v16si)(__m512i)(index), \
- (__v16sf)(__m512)(v1), (int)(scale))
-
-#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \
- __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \
- (__v16si)(__m512i)(index), \
- (__v16sf)(__m512)(v1), (int)(scale))
-
-#define _mm512_i32scatter_epi32(addr, index, v1, scale) \
- __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \
- (__v16si)(__m512i)(index), \
- (__v16si)(__m512i)(v1), (int)(scale))
-
-#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
- __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \
- (__v16si)(__m512i)(index), \
- (__v16si)(__m512i)(v1), (int)(scale))
-
-#define _mm512_i32scatter_pd(addr, index, v1, scale) \
- __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \
- (__v8si)(__m256i)(index), \
- (__v8df)(__m512d)(v1), (int)(scale))
-
-#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \
- __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \
- (__v8si)(__m256i)(index), \
- (__v8df)(__m512d)(v1), (int)(scale))
-
-#define _mm512_i32scatter_epi64(addr, index, v1, scale) \
- __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \
- (__v8si)(__m256i)(index), \
- (__v8di)(__m512i)(v1), (int)(scale))
-
-#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
- __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \
- (__v8si)(__m256i)(index), \
- (__v8di)(__m512i)(v1), (int)(scale))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
- (__v4sf)__A,
- (__v4sf)__B,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmadd_round_ss(A, B, C, R) \
- ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(C), (__mmask8)-1, \
- (int)(R)))
-
-#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
- (__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
- return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
- (__v4sf)__B,
- (__v4sf)__C,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
- ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(C), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
-{
- return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
- (__v4sf)__X,
- (__v4sf)__Y,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \
- ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
- (__v4sf)(__m128)(X), \
- (__v4sf)(__m128)(Y), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
- (__v4sf)__A,
- -(__v4sf)__B,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmsub_round_ss(A, B, C, R) \
- ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- -(__v4sf)(__m128)(C), (__mmask8)-1, \
- (int)(R)))
-
-#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
- (__v4sf)(__m128)(A), \
- -(__v4sf)(__m128)(B), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
- return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
- (__v4sf)__B,
- -(__v4sf)__C,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
- ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- -(__v4sf)(__m128)(C), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
-{
- return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
- (__v4sf)__X,
- (__v4sf)__Y,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \
- ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
- (__v4sf)(__m128)(X), \
- (__v4sf)(__m128)(Y), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
- -(__v4sf)__A,
- (__v4sf)__B,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fnmadd_round_ss(A, B, C, R) \
- ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
- -(__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(C), (__mmask8)-1, \
- (int)(R)))
-
-#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
- -(__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
- return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
- -(__v4sf)__B,
- (__v4sf)__C,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
- ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
- -(__v4sf)(__m128)(B), \
- (__v4sf)(__m128)(C), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
-{
- return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
- -(__v4sf)__X,
- (__v4sf)__Y,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \
- ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
- -(__v4sf)(__m128)(X), \
- (__v4sf)(__m128)(Y), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return __builtin_ia32_vfmaddss3_mask((__v4sf)__W,
- -(__v4sf)__A,
- -(__v4sf)__B,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fnmsub_round_ss(A, B, C, R) \
- ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \
- -(__v4sf)(__m128)(B), \
- -(__v4sf)(__m128)(C), (__mmask8)-1, \
- (int)(R)))
-
-#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
- -(__v4sf)(__m128)(A), \
- -(__v4sf)(__m128)(B), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
- return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
- -(__v4sf)__B,
- -(__v4sf)__C,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
- ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
- -(__v4sf)(__m128)(B), \
- -(__v4sf)(__m128)(C), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
-{
- return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
- -(__v4sf)__X,
- (__v4sf)__Y,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \
- ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
- -(__v4sf)(__m128)(X), \
- (__v4sf)(__m128)(Y), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
- (__v2df)__A,
- (__v2df)__B,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmadd_round_sd(A, B, C, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(C), (__mmask8)-1, \
- (int)(R)))
-
-#define _mm_mask_fmadd_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
- (__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
- return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
- (__v2df)__B,
- (__v2df)__C,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(C), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
-{
- return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
- (__v2df)__X,
- (__v2df)__Y,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
- (__v2df)(__m128d)(X), \
- (__v2df)(__m128d)(Y), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
- (__v2df)__A,
- -(__v2df)__B,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmsub_round_sd(A, B, C, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- -(__v2df)(__m128d)(C), (__mmask8)-1, \
- (int)(R)))
-
-#define _mm_mask_fmsub_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
- (__v2df)(__m128d)(A), \
- -(__v2df)(__m128d)(B), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
- return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
- (__v2df)__B,
- -(__v2df)__C,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- -(__v2df)(__m128d)(C), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
-{
- return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
- (__v2df)__X,
- (__v2df)__Y,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \
- ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
- (__v2df)(__m128d)(X), \
- (__v2df)(__m128d)(Y), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
- -(__v2df)__A,
- (__v2df)__B,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fnmadd_round_sd(A, B, C, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
- -(__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(C), (__mmask8)-1, \
- (int)(R)))
-
-#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
- -(__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
- return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
- -(__v2df)__B,
- (__v2df)__C,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
- -(__v2df)(__m128d)(B), \
- (__v2df)(__m128d)(C), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
-{
- return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
- -(__v2df)__X,
- (__v2df)__Y,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
- -(__v2df)(__m128d)(X), \
- (__v2df)(__m128d)(Y), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return __builtin_ia32_vfmaddsd3_mask((__v2df)__W,
- -(__v2df)__A,
- -(__v2df)__B,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fnmsub_round_sd(A, B, C, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \
- -(__v2df)(__m128d)(B), \
- -(__v2df)(__m128d)(C), (__mmask8)-1, \
- (int)(R)))
-
-#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
- -(__v2df)(__m128d)(A), \
- -(__v2df)(__m128d)(B), (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
- return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
- -(__v2df)__B,
- -(__v2df)__C,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \
- ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
- -(__v2df)(__m128d)(B), \
- -(__v2df)(__m128d)(C), \
- (__mmask8)(U), \
- (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
-{
- return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
- -(__v2df)__X,
- (__v2df)__Y,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \
- ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
- -(__v2df)(__m128d)(X), \
- (__v2df)(__m128d)(Y), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_permutex_pd(X, C) \
- ((__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C)))
-
-#define _mm512_mask_permutex_pd(W, U, X, C) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_permutex_pd((X), (C)), \
- (__v8df)(__m512d)(W)))
-
-#define _mm512_maskz_permutex_pd(U, X, C) \
- ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
- (__v8df)_mm512_permutex_pd((X), (C)), \
- (__v8df)_mm512_setzero_pd()))
-
-#define _mm512_permutex_epi64(X, C) \
- ((__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C)))
-
-#define _mm512_mask_permutex_epi64(W, U, X, C) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_permutex_epi64((X), (C)), \
- (__v8di)(__m512i)(W)))
-
-#define _mm512_maskz_permutex_epi64(U, X, C) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_permutex_epi64((X), (C)), \
- (__v8di)_mm512_setzero_si512()))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
-{
- return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_permutexvar_pd(__X, __Y),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_permutexvar_pd(__X, __Y),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_permutexvar_epi64(__X, __Y),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
- __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M,
- (__v8di)_mm512_permutexvar_epi64(__X, __Y),
- (__v8di)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
-{
- return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_permutexvar_ps(__X, __Y),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_permutexvar_ps(__X, __Y),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X);
-}
-
-#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_permutexvar_epi32(__X, __Y),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
- __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M,
- (__v16si)_mm512_permutexvar_epi32(__X, __Y),
- (__v16si)__W);
-}
-
-#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kand (__mmask16 __A, __mmask16 __B)
-{
- return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kandn (__mmask16 __A, __mmask16 __B)
-{
- return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kor (__mmask16 __A, __mmask16 __B)
-{
- return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm512_kortestc (__mmask16 __A, __mmask16 __B)
-{
- return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm512_kortestz (__mmask16 __A, __mmask16 __B)
-{
- return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B)
-{
- return (unsigned char)__builtin_ia32_kortestchi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B)
-{
- return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) {
- *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B);
- return (unsigned char)__builtin_ia32_kortestzhi(__A, __B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
-{
- return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kxnor (__mmask16 __A, __mmask16 __B)
-{
- return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_mm512_kxor (__mmask16 __A, __mmask16 __B)
-{
- return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
-}
-
-#define _kand_mask16 _mm512_kand
-#define _kandn_mask16 _mm512_kandn
-#define _knot_mask16 _mm512_knot
-#define _kor_mask16 _mm512_kor
-#define _kxnor_mask16 _mm512_kxnor
-#define _kxor_mask16 _mm512_kxor
-
-#define _kshiftli_mask16(A, I) \
- ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I)))
-
-#define _kshiftri_mask16(A, I) \
- ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I)))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_cvtmask16_u32(__mmask16 __A) {
- return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_cvtu32_mask16(unsigned int __A) {
- return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS
-_load_mask16(__mmask16 *__A) {
- return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_store_mask16(__mmask16 *__A, __mmask16 __B) {
- *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_stream_si512 (void * __P, __m512i __A)
-{
- typedef __v8di __v8di_aligned __attribute__((aligned(64)));
- __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_stream_load_si512 (void const *__P)
-{
- typedef __v8di __v8di_aligned __attribute__((aligned(64)));
- return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_stream_pd (void *__P, __m512d __A)
-{
- typedef __v8df __v8df_aligned __attribute__((aligned(64)));
- __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_stream_ps (void *__P, __m512 __A)
-{
- typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
- __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
- (__v8df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
- (__v8df)
- _mm512_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
- (__v8di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
- (__v8di)
- _mm512_setzero_si512 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
- (__v16sf) __W,
- (__mmask16) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
- (__v16sf)
- _mm512_setzero_ps (),
- (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
- (__v16si) __W,
- (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
- (__v16si)
- _mm512_setzero_si512 (),
- (__mmask16) __U);
-}
-
-#define _mm_cmp_round_ss_mask(X, Y, P, R) \
- ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
- (__v4sf)(__m128)(Y), (int)(P), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
- ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
- (__v4sf)(__m128)(Y), (int)(P), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_cmp_ss_mask(X, Y, P) \
- ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
- (__v4sf)(__m128)(Y), (int)(P), \
- (__mmask8)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_cmp_ss_mask(M, X, Y, P) \
- ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
- (__v4sf)(__m128)(Y), (int)(P), \
- (__mmask8)(M), \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_cmp_round_sd_mask(X, Y, P, R) \
- ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
- (__v2df)(__m128d)(Y), (int)(P), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
- ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
- (__v2df)(__m128d)(Y), (int)(P), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_cmp_sd_mask(X, Y, P) \
- ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
- (__v2df)(__m128d)(Y), (int)(P), \
- (__mmask8)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_cmp_sd_mask(M, X, Y, P) \
- ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
- (__v2df)(__m128d)(Y), (int)(P), \
- (__mmask8)(M), \
- _MM_FROUND_CUR_DIRECTION))
-
-/* Bit Test */
-
-static __inline __mmask16 __DEFAULT_FN_ATTRS512
-_mm512_test_epi32_mask (__m512i __A, __m512i __B)
-{
- return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
-_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
-{
- return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline __mmask8 __DEFAULT_FN_ATTRS512
-_mm512_test_epi64_mask (__m512i __A, __m512i __B)
-{
- return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
-_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
-{
- return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
-_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
-{
- return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS512
-_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
-{
- return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
-_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
-{
- return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS512
-_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
-{
- return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B),
- _mm512_setzero_si512());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_movehdup_ps (__m512 __A)
-{
- return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
- 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_movehdup_ps(__A),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_movehdup_ps(__A),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_moveldup_ps (__m512 __A)
-{
- return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
- 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_moveldup_ps(__A),
- (__v16sf)__W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
-{
- return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_moveldup_ps(__A),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
-{
- return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B),
- _mm_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
-{
- return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B),
- _mm_setzero_pd());
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
-{
- __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
-{
- __builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
-{
- __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
- (__v4sf)_mm_setzero_ps(),
- 0, 4, 4, 4);
-
- return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_load_ss (__mmask8 __U, const float* __A)
-{
- return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A,
- (__v4sf) _mm_setzero_ps(),
- __U & 1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
-{
- __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W,
- (__v2df)_mm_setzero_pd(),
- 0, 2);
-
- return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_load_sd (__mmask8 __U, const double* __A)
-{
- return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A,
- (__v2df) _mm_setzero_pd(),
- __U & 1);
-}
-
-#define _mm512_shuffle_epi32(A, I) \
- ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I)))
-
-#define _mm512_mask_shuffle_epi32(W, U, A, I) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_shuffle_epi32((A), (I)), \
- (__v16si)(__m512i)(W)))
-
-#define _mm512_maskz_shuffle_epi32(U, A, I) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_shuffle_epi32((A), (I)), \
- (__v16si)_mm512_setzero_si512()))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
- (__v8df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
- (__v8df) _mm512_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
- (__v8di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
- (__v8di) _mm512_setzero_si512 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
-{
- return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
- (__v8df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
-{
- return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
- (__v8df) _mm512_setzero_pd(),
- (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
- (__v8di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
- (__v8di) _mm512_setzero_si512(),
- (__mmask8) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
-{
- return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
- (__v16sf) __W,
- (__mmask16) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
-{
- return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
- (__v16sf) _mm512_setzero_ps(),
- (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
- (__v16si) __W,
- (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
- (__v16si) _mm512_setzero_si512(),
- (__mmask16) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
- (__v16sf) __W,
- (__mmask16) __U);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
- (__v16sf) _mm512_setzero_ps(),
- (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
- (__v16si) __W,
- (__mmask16) __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
-{
- return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
- (__v16si) _mm512_setzero_si512(),
- (__mmask16) __U);
-}
-
-#define _mm512_cvt_roundps_pd(A, R) \
- ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
- (__v8df)_mm512_undefined_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm512_mask_cvt_roundps_pd(W, U, A, R) \
- ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
- (__v8df)(__m512d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundps_pd(U, A, R) \
- ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
- (__v8df)_mm512_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtps_pd (__m256 __A)
-{
- return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_cvtps_pd(__A),
- (__v8df)__W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
-{
- return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U,
- (__v8df)_mm512_cvtps_pd(__A),
- (__v8df)_mm512_setzero_pd());
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_cvtpslo_pd (__m512 __A)
-{
- return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
-{
- return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
- (__v8df) __A,
- (__v8df) __W);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
-{
- return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
- (__v8df) __A,
- (__v8df) _mm512_setzero_pd ());
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
- (__v16sf) __A,
- (__v16sf) __W);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
-{
- return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
- (__v16sf) __A,
- (__v16sf) _mm512_setzero_ps ());
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
-{
- __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
-{
- __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
-{
- __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
- (__mmask16) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512
-_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
-{
- __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
- (__mmask16) __U);
-}
-
-#define _mm_cvt_roundsd_ss(A, B, R) \
- ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
- (__v2df)(__m128d)(B), \
- (__v4sf)_mm_undefined_ps(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
- (__v2df)(__m128d)(B), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \
- ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
- (__v2df)(__m128d)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
-{
- return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
- (__v2df)__B,
- (__v4sf)__W,
- (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
-{
- return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A,
- (__v2df)__B,
- (__v4sf)_mm_setzero_ps(),
- (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvtss_i32 _mm_cvtss_si32
-#define _mm_cvtsd_i32 _mm_cvtsd_si32
-#define _mm_cvti32_sd _mm_cvtsi32_sd
-#define _mm_cvti32_ss _mm_cvtsi32_ss
-#ifdef __x86_64__
-#define _mm_cvtss_i64 _mm_cvtss_si64
-#define _mm_cvtsd_i64 _mm_cvtsd_si64
-#define _mm_cvti64_sd _mm_cvtsi64_sd
-#define _mm_cvti64_ss _mm_cvtsi64_ss
-#endif
-
-#ifdef __x86_64__
-#define _mm_cvt_roundi64_sd(A, B, R) \
- ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
- (int)(R)))
-
-#define _mm_cvt_roundsi64_sd(A, B, R) \
- ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
- (int)(R)))
-#endif
-
-#define _mm_cvt_roundsi32_ss(A, B, R) \
- ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))
-
-#define _mm_cvt_roundi32_ss(A, B, R) \
- ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)))
-
-#ifdef __x86_64__
-#define _mm_cvt_roundsi64_ss(A, B, R) \
- ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
- (int)(R)))
-
-#define _mm_cvt_roundi64_ss(A, B, R) \
- ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
- (int)(R)))
-#endif
-
-#define _mm_cvt_roundss_sd(A, B, R) \
- ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
- (__v4sf)(__m128)(B), \
- (__v2df)_mm_undefined_pd(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
- (__v4sf)(__m128)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_cvt_roundss_sd(U, A, B, R) \
- ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
- (__v4sf)(__m128)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
-{
- return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
- (__v4sf)__B,
- (__v2df)__W,
- (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
-{
- return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A,
- (__v4sf)__B,
- (__v2df)_mm_setzero_pd(),
- (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_cvtu32_sd (__m128d __A, unsigned __B)
-{
- __A[0] = __B;
- return __A;
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundu64_sd(A, B, R) \
- ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
- (unsigned long long)(B), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
-{
- __A[0] = __B;
- return __A;
-}
-#endif
-
-#define _mm_cvt_roundu32_ss(A, B, R) \
- ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
- (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtu32_ss (__m128 __A, unsigned __B)
-{
- __A[0] = __B;
- return __A;
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundu64_ss(A, B, R) \
- ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
- (unsigned long long)(B), (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
-{
- __A[0] = __B;
- return __A;
-}
-#endif
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
-{
- return (__m512i) __builtin_ia32_selectd_512(__M,
- (__v16si) _mm512_set1_epi32(__A),
- (__v16si) __O);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
-{
- return (__m512i) __builtin_ia32_selectq_512(__M,
- (__v8di) _mm512_set1_epi64(__A),
- (__v8di) __O);
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
- char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
- char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
- char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
- char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
- char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
- char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
- char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
- char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
- char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
- char __e4, char __e3, char __e2, char __e1, char __e0) {
-
- return __extension__ (__m512i)(__v64qi)
- {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
- __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
- __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
- __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
- __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
- __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
- __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
- __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
- short __e27, short __e26, short __e25, short __e24, short __e23,
- short __e22, short __e21, short __e20, short __e19, short __e18,
- short __e17, short __e16, short __e15, short __e14, short __e13,
- short __e12, short __e11, short __e10, short __e9, short __e8,
- short __e7, short __e6, short __e5, short __e4, short __e3,
- short __e2, short __e1, short __e0) {
- return __extension__ (__m512i)(__v32hi)
- {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
- __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
- __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
- __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
-}
-
-static __inline __m512i __DEFAULT_FN_ATTRS512
-_mm512_set_epi32 (int __A, int __B, int __C, int __D,
- int __E, int __F, int __G, int __H,
- int __I, int __J, int __K, int __L,
- int __M, int __N, int __O, int __P)
-{
- return __extension__ (__m512i)(__v16si)
- { __P, __O, __N, __M, __L, __K, __J, __I,
- __H, __G, __F, __E, __D, __C, __B, __A };
-}
-
-#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \
- e8,e9,e10,e11,e12,e13,e14,e15) \
- _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
- (e5),(e4),(e3),(e2),(e1),(e0))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_set_epi64 (long long __A, long long __B, long long __C,
- long long __D, long long __E, long long __F,
- long long __G, long long __H)
-{
- return __extension__ (__m512i) (__v8di)
- { __H, __G, __F, __E, __D, __C, __B, __A };
-}
-
-#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \
- _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_set_pd (double __A, double __B, double __C, double __D,
- double __E, double __F, double __G, double __H)
-{
- return __extension__ (__m512d)
- { __H, __G, __F, __E, __D, __C, __B, __A };
-}
-
-#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \
- _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_set_ps (float __A, float __B, float __C, float __D,
- float __E, float __F, float __G, float __H,
- float __I, float __J, float __K, float __L,
- float __M, float __N, float __O, float __P)
-{
- return __extension__ (__m512)
- { __P, __O, __N, __M, __L, __K, __J, __I,
- __H, __G, __F, __E, __D, __C, __B, __A };
-}
-
-#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
- _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
- (e4),(e3),(e2),(e1),(e0))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_abs_ps(__m512 __A)
-{
- return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), (__m512i)__A);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
-{
- return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF), (__m512i)__A);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_abs_pd(__m512d __A)
-{
- return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), (__v8di)__A);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
-{
- return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF), (__v8di)__A);
-}
-
-/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars
- * as outputs. This class of vector operation forms the basis of many
- * scientific computations. In vector-reduction arithmetic, the evaluation
- * order is independent of the order of the input elements of V.
- *
- * For floating-point intrinsics:
- * 1. When using fadd/fmul intrinsics, the order of operations within the
- *    vector is unspecified (associative math).
- * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector
- *    produce unspecified results.
- *
- * A bisection method is used: at each step, the vector from the previous
- * step is partitioned in half, and the operation is applied to the two
- * halves. This takes log2(n) steps, where n is the number of elements in
- * the vector.
- */
-
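-/* Illustrative sketch (not part of the original header): a scalar model of
- * the bisection reduction described above, using a hypothetical helper and a
- * plain 8-element array in place of a __m512i. Each pass halves the active
- * length, so eight elements are reduced in log2(8) = 3 passes, and the final
- * value does not depend on the order of the input elements. */
-static __inline__ long long
-__reduce_add_bisect8(long long __v[8])
-{
-  for (int __len = 8; __len > 1; __len /= 2)       /* 8 -> 4 -> 2 -> 1 */
-    for (int __i = 0; __i < __len / 2; ++__i)
-      __v[__i] = __v[__i] + __v[__i + __len / 2];  /* combine the two halves */
-  return __v[0];                                   /* the reduced scalar */
-}
-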
-static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) {
-#if (__clang_major__ > 14)
- return __builtin_reduce_add((__v8di)__W);
-#else
- return __builtin_ia32_reduce_add_q512(__W);
-#endif
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) {
-#if (__clang_major__ > 14)
- return __builtin_reduce_mul((__v8di)__W);
-#else
- return __builtin_ia32_reduce_mul_q512(__W);
-#endif
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) {
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_and_q512(__W);
-#else
- return __builtin_reduce_and((__v8di)__W);
-#endif
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) {
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_or_q512(__W);
-#else
- return __builtin_reduce_or((__v8di)__W);
-#endif
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
- __W = _mm512_maskz_mov_epi64(__M, __W);
-#if (__clang_major__ > 14)
- return __builtin_reduce_add((__v8di)__W);
-#else
- return __builtin_ia32_reduce_add_q512(__W);
-#endif
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
- __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W);
-#if (__clang_major__ > 14)
- return __builtin_reduce_mul((__v8di)__W);
-#else
- return __builtin_ia32_reduce_mul_q512(__W);
-#endif
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
- __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W);
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_and_q512(__W);
-#else
- return __builtin_reduce_and((__v8di)__W);
-#endif
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
- __W = _mm512_maskz_mov_epi64(__M, __W);
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_or_q512(__W);
-#else
- return __builtin_reduce_or((__v8di)__W);
-#endif
-}
-
-// -0.0 is used as the start value because it is the neutral element of
-// floating-point addition, so it does not affect the result. For more
-// information, please refer to
-// https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic
-static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) {
- return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
-}
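-
-/* Usage sketch (illustrative only; the helper name is hypothetical): summing
- * the lanes of a vector built with _mm512_set1_pd. Because the -0.0 start
- * value is the neutral element, the result is exactly 8 * 1.5 = 12.0. A +0.0
- * start value would, for example, turn an all -0.0 input into +0.0, which is
- * why -0.0 is used above. */
-static __inline__ double __DEFAULT_FN_ATTRS512
-__reduce_add_pd_example(void)
-{
-  __m512d __v = _mm512_set1_pd(1.5);   /* eight lanes of 1.5 */
-  return _mm512_reduce_add_pd(__v);    /* 12.0 */
-}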
-
-static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) {
- return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
- __W = _mm512_maskz_mov_pd(__M, __W);
- return __builtin_ia32_reduce_fadd_pd512(-0.0, __W);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
- __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W);
- return __builtin_ia32_reduce_fmul_pd512(1.0, __W);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_reduce_add_epi32(__m512i __W) {
-#if (__clang_major__ > 14)
- return __builtin_reduce_add((__v16si)__W);
-#else
- return __builtin_ia32_reduce_add_d512((__v16si)__W);
-#endif
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_reduce_mul_epi32(__m512i __W) {
-#if (__clang_major__ > 14)
- return __builtin_reduce_mul((__v16si)__W);
-#else
- return __builtin_ia32_reduce_mul_d512((__v16si)__W);
-#endif
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_reduce_and_epi32(__m512i __W) {
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_and_d512((__v16si)__W);
-#else
- return __builtin_reduce_and((__v16si)__W);
-#endif
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_reduce_or_epi32(__m512i __W) {
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_or_d512((__v16si)__W);
-#else
- return __builtin_reduce_or((__v16si)__W);
-#endif
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
- __W = _mm512_maskz_mov_epi32(__M, __W);
-#if (__clang_major__ > 14)
- return __builtin_reduce_add((__v16si)__W);
-#else
- return __builtin_ia32_reduce_add_d512((__v16si)__W);
-#endif
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
- __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W);
-#if (__clang_major__ > 14)
- return __builtin_reduce_mul((__v16si)__W);
-#else
- return __builtin_ia32_reduce_mul_d512((__v16si)__W);
-#endif
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
- __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W);
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_and_d512((__v16si)__W);
-#else
- return __builtin_reduce_and((__v16si)__W);
-#endif
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
- __W = _mm512_maskz_mov_epi32(__M, __W);
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_or_d512((__v16si)__W);
-#else
- return __builtin_reduce_or((__v16si)__W);
-#endif
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_reduce_add_ps(__m512 __W) {
- return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_reduce_mul_ps(__m512 __W) {
- return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
- __W = _mm512_maskz_mov_ps(__M, __W);
- return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
- __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W);
- return __builtin_ia32_reduce_fmul_ps512(1.0f, __W);
-}
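/*
 * Sketch (hypothetical helper, assumes <immintrin.h> and -mavx512f): a plain
 * horizontal sum of 16 floats.  The -0.0 start value passed to the builtin is
 * the identity of floating-point addition, so it does not perturb the result.
 */
static float sum16(const float *p) { /* p must be 64-byte aligned */
  return _mm512_reduce_add_ps(_mm512_load_ps(p));
}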
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_epi64(__m512i __V) {
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_smax_q512(__V);
-#else
- return __builtin_reduce_max((__v8di)__V);
-#endif
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_epu64(__m512i __V) {
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_umax_q512(__V);
-#else
- return __builtin_reduce_max((__v8du)__V);
-#endif
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_epi64(__m512i __V) {
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_smin_q512(__V);
-#else
- return __builtin_reduce_min((__v8di)__V);
-#endif
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_epu64(__m512i __V) {
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_umin_q512(__V);
-#else
- return __builtin_reduce_min((__v8du)__V);
-#endif
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
- __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V);
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_smax_q512(__V);
-#else
- return __builtin_reduce_max((__v8di)__V);
-#endif
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
- __V = _mm512_maskz_mov_epi64(__M, __V);
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_umax_q512(__V);
-#else
- return __builtin_reduce_max((__v8du)__V);
-#endif
-}
-
-static __inline__ long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
- __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V);
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_smin_q512(__V);
-#else
- return __builtin_reduce_min((__v8di)__V);
-#endif
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
- __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V);
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_umin_q512(__V);
-#else
- return __builtin_reduce_min((__v8du)__V);
-#endif
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_epi32(__m512i __V) {
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_smax_d512((__v16si)__V);
-#else
- return __builtin_reduce_max((__v16si)__V);
-#endif
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_epu32(__m512i __V) {
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_umax_d512((__v16si)__V);
-#else
- return __builtin_reduce_max((__v16su)__V);
-#endif
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_epi32(__m512i __V) {
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_smin_d512((__v16si)__V);
-#else
- return __builtin_reduce_min((__v16si)__V);
-#endif
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_epu32(__m512i __V) {
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_umin_d512((__v16si)__V);
-#else
- return __builtin_reduce_min((__v16su)__V);
-#endif
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
- __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V);
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_smax_d512((__v16si)__V);
-#else
- return __builtin_reduce_max((__v16si)__V);
-#endif
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
- __V = _mm512_maskz_mov_epi32(__M, __V);
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_umax_d512((__v16si)__V);
-#else
- return __builtin_reduce_max((__v16su)__V);
-#endif
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
- __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V);
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_smin_d512((__v16si)__V);
-#else
- return __builtin_reduce_min((__v16si)__V);
-#endif
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
- __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V);
-#if (__clang_major__ < 14)
- return __builtin_ia32_reduce_umin_d512((__v16si)__V);
-#else
- return __builtin_reduce_min((__v16su)__V);
-#endif
-}
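/*
 * Sketch (hypothetical helper): for the unsigned-min reduction the identity
 * is ~0U, so lanes cleared in the mask can never win.  Here only the first
 * ten 32-bit lanes participate.
 */
static unsigned int min_first_10(__m512i v) {
  return _mm512_mask_reduce_min_epu32((__mmask16)0x03FF, v);
}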
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_pd(__m512d __V) {
- return __builtin_ia32_reduce_fmax_pd512(__V);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_pd(__m512d __V) {
- return __builtin_ia32_reduce_fmin_pd512(__V);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
- __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
- return __builtin_ia32_reduce_fmax_pd512(__V);
-}
-
-static __inline__ double __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
- __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
- return __builtin_ia32_reduce_fmin_pd512(__V);
-}
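/*
 * Sketch (hypothetical helper): the masked floating-point max substitutes
 * -infinity for unselected lanes, so an all-false mask yields -infinity
 * rather than an arbitrary lane value.
 */
static double max_selected(__mmask8 m, __m512d v) {
  return _mm512_mask_reduce_max_pd(m, v);
}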
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_ps(__m512 __V) {
- return __builtin_ia32_reduce_fmax_ps512(__V);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_ps(__m512 __V) {
- return __builtin_ia32_reduce_fmin_ps512(__V);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
- __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
- return __builtin_ia32_reduce_fmax_ps512(__V);
-}
-
-static __inline__ float __DEFAULT_FN_ATTRS512
-_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
- __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
- return __builtin_ia32_reduce_fmin_ps512(__V);
-}
-
-/// Moves the least significant 32 bits of a vector of [16 x i32] to a
-/// 32-bit signed integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
-///
-/// \param __A
-/// A vector of [16 x i32]. The least significant 32 bits are moved to the
-/// destination.
-/// \returns A 32-bit signed integer containing the moved value.
-static __inline__ int __DEFAULT_FN_ATTRS512
-_mm512_cvtsi512_si32(__m512i __A) {
- __v16si __b = (__v16si)__A;
- return __b[0];
-}
-
-/// Loads 8 double-precision (64-bit) floating-point elements from memory
-/// locations starting at location \a base_addr at packed 32-bit integer indices
-/// stored in the lower half of \a vindex, scaled by \a scale, and stores them
-/// in dst.
-///
-/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// i := j*64
-/// m := j*32
-/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-/// dst[i+63:i] := MEM[addr+63:addr]
-/// ENDFOR
-/// dst[MAX:512] := 0
-/// \endoperation
-#define _mm512_i32logather_pd(vindex, base_addr, scale) \
- _mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale))
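/*
 * Sketch (hypothetical helper): gather table[idx[j]] for j = 0..7.  Only the
 * low eight 32-bit indices of vindex are used, and scale is 8 because the
 * gathered elements are 8-byte doubles.
 */
static __m512d gather8(const double *table, __m512i vindex) {
  return _mm512_i32logather_pd(vindex, table, 8);
}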
-
-/// Loads 8 double-precision (64-bit) floating-point elements from memory
-/// starting at location \a base_addr at packed 32-bit integer indices stored in
-/// the lower half of \a vindex scaled by \a scale into dst using writemask
-/// \a mask (elements are copied from \a src when the corresponding mask bit is
-/// not set).
-///
-/// This intrinsic corresponds to the <c> VGATHERDPD </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// i := j*64
-/// m := j*32
-/// IF mask[j]
-/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-/// dst[i+63:i] := MEM[addr+63:addr]
-/// ELSE
-/// dst[i+63:i] := src[i+63:i]
-/// FI
-/// ENDFOR
-/// dst[MAX:512] := 0
-/// \endoperation
-#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \
- _mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex), \
- (base_addr), (scale))
-
-/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
-/// at packed 32-bit integer indices stored in the lower half of \a vindex
-/// scaled by \a scale and stores them in dst.
-///
-/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// i := j*64
-/// m := j*32
-/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-/// dst[i+63:i] := MEM[addr+63:addr]
-/// ENDFOR
-/// dst[MAX:512] := 0
-/// \endoperation
-#define _mm512_i32logather_epi64(vindex, base_addr, scale) \
- _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale))
-
-/// Loads 8 64-bit integer elements from memory starting at location \a base_addr
-/// at packed 32-bit integer indices stored in the lower half of \a vindex
-/// scaled by \a scale and stores them in dst using writemask \a mask (elements
-/// are copied from \a src when the corresponding mask bit is not set).
-///
-/// This intrinsic corresponds to the <c> VPGATHERDQ </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// i := j*64
-/// m := j*32
-/// IF mask[j]
-/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-/// dst[i+63:i] := MEM[addr+63:addr]
-/// ELSE
-/// dst[i+63:i] := src[i+63:i]
-/// FI
-/// ENDFOR
-/// dst[MAX:512] := 0
-/// \endoperation
-#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \
- _mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex), \
- (base_addr), (scale))
-
-/// Stores 8 packed double-precision (64-bit) floating-point elements from \a v1
-/// to memory locations starting at location \a base_addr at packed 32-bit
-/// integer indices stored in \a vindex scaled by \a scale.
-///
-/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// i := j*64
-/// m := j*32
-/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-/// MEM[addr+63:addr] := v1[i+63:i]
-/// ENDFOR
-/// \endoperation
-#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \
- _mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale))
-
-/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1
-/// to memory locations starting at location \a base_addr at packed 32-bit
-/// integer indices stored in \a vindex scaled by \a scale. Only those elements
-/// whose corresponding mask bit is set in writemask \a mask are written to
-/// memory.
-///
-/// This intrinsic corresponds to the <c> VSCATTERDPD </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// i := j*64
-/// m := j*32
-/// IF mask[j]
-/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-/// MEM[addr+63:addr] := v1[i+63:i]
-/// FI
-/// ENDFOR
-/// \endoperation
-#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \
- _mm512_mask_i32scatter_pd((base_addr), (mask), \
- _mm512_castsi512_si256(vindex), (v1), (scale))
-
-/// Stores 8 packed 64-bit integer elements located in \a v1 to memory
-/// locations starting at location \a base_addr at packed 32-bit integer
-/// indices stored in \a vindex scaled by \a scale.
-///
-/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// i := j*64
-/// m := j*32
-/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-/// MEM[addr+63:addr] := v1[i+63:i]
-/// ENDFOR
-/// \endoperation
-#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \
- _mm512_i32scatter_epi64((base_addr), \
- _mm512_castsi512_si256(vindex), (v1), (scale))
-
-/// Stores 8 packed 64-bit integer elements located in \a v1 to memory
-/// locations starting at location \a base_addr at packed 32-bit integer
-/// indices stored in \a vindex scaled by \a scale, using writemask \a mask
-/// (elements whose corresponding mask bit is not set are not written to memory).
-///
-/// This intrinsic corresponds to the <c> VPSCATTERDQ </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// i := j*64
-/// m := j*32
-/// IF mask[j]
-/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8
-/// MEM[addr+63:addr] := v1[i+63:i]
-/// FI
-/// ENDFOR
-/// \endoperation
-#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \
- _mm512_mask_i32scatter_epi64((base_addr), (mask), \
- _mm512_castsi512_si256(vindex), (v1), (scale))
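/*
 * Sketch (hypothetical helper): scatter the eight 64-bit lanes of v1 to
 * table[idx[j]] for the lanes selected by m; scale is 8 for 8-byte elements.
 */
static void scatter8_masked(long long *table, __mmask8 m, __m512i vindex,
                            __m512i v1) {
  _mm512_mask_i32loscatter_epi64(table, m, vindex, v1, 8);
}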
-
-#undef __DEFAULT_FN_ATTRS512
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __AVX512FINTRIN_H */
+++ /dev/null
-/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512FP16INTRIN_H
-#define __AVX512FP16INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
-typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
-typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
-typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
-typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
-typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
-typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
-typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
-typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS512 \
- __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
- __min_vector_width__(512)))
-#define __DEFAULT_FN_ATTRS256 \
- __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
- __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128 \
- __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \
- __min_vector_width__(128)))
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) {
- return __a[0];
-}
-
-static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) {
- return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-}
-
-static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) {
- return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) {
- return (__m256h)__builtin_ia32_undef256();
-}
-
-static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) {
- return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) {
- return (__m128h)__builtin_ia32_undef128();
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) {
- return (__m512h)__builtin_ia32_undef512();
-}
-
-static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) {
- return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h,
- __h, __h, __h, __h, __h, __h, __h, __h,
- __h, __h, __h, __h, __h, __h, __h, __h,
- __h, __h, __h, __h, __h, __h, __h, __h};
-}
-
-static __inline __m512h __DEFAULT_FN_ATTRS512
-_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
- _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
- _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
- _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16,
- _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20,
- _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24,
- _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28,
- _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) {
- return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26,
- __h25, __h24, __h23, __h22, __h21, __h20, __h19,
- __h18, __h17, __h16, __h15, __h14, __h13, __h12,
- __h11, __h10, __h9, __h8, __h7, __h6, __h5,
- __h4, __h3, __h2, __h1};
-}
-
-#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
- h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24, \
- h25, h26, h27, h28, h29, h30, h31, h32) \
- _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \
- (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \
- (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6), \
- (h5), (h4), (h3), (h2), (h1))
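/*
 * Sketch (hypothetical helper, assumes -mavx512fp16): _mm512_set_ph lists its
 * arguments from the highest element down, while _mm512_setr_ph lists them
 * from element 0 up, so the helper below places 0.0 in element 0 and 31.0 in
 * element 31.
 */
static __m512h iota_ph(void) {
  return _mm512_setr_ph(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
                        30, 31);
}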
-
-static __inline __m512h __DEFAULT_FN_ATTRS512
-_mm512_set1_pch(_Float16 _Complex h) {
- return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h));
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) {
- return (__m128)__a;
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) {
- return (__m256)__a;
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) {
- return (__m512)__a;
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) {
- return (__m128d)__a;
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) {
- return (__m256d)__a;
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) {
- return (__m512d)__a;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) {
- return (__m128i)__a;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_castph_si256(__m256h __a) {
- return (__m256i)__a;
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_castph_si512(__m512h __a) {
- return (__m512i)__a;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) {
- return (__m128h)__a;
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) {
- return (__m256h)__a;
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) {
- return (__m512h)__a;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) {
- return (__m128h)__a;
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) {
- return (__m256h)__a;
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) {
- return (__m512h)__a;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) {
- return (__m128h)__a;
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_castsi256_ph(__m256i __a) {
- return (__m256h)__a;
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_castsi512_ph(__m512i __a) {
- return (__m512h)__a;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_castph256_ph128(__m256h __a) {
- return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_castph512_ph128(__m512h __a) {
- return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_castph512_ph256(__m512h __a) {
- return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
- 12, 13, 14, 15);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_castph128_ph256(__m128h __a) {
- return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
- -1, -1, -1, -1, -1);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_castph128_ph512(__m128h __a) {
- return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_castph256_ph512(__m256h __a) {
- return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
- 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1);
-}
-
-/// Constructs a 256-bit floating-point vector of [16 x half] from a
-/// 128-bit floating-point vector of [8 x half]. The lower 128 bits
-/// contain the value of the source vector. The upper 128 bits are set
-/// to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit vector of [8 x half].
-/// \returns A 256-bit floating-point vector of [16 x half]. The lower 128 bits
-/// contain the value of the parameter. The upper 128 bits are set to zero.
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_zextph128_ph256(__m128h __a) {
- return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4,
- 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-/// Constructs a 512-bit floating-point vector of [32 x half] from a
-/// 128-bit floating-point vector of [8 x half]. The lower 128 bits
-/// contain the value of the source vector. The upper 384 bits are set
-/// to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit vector of [8 x half].
-/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits
-/// contain the value of the parameter. The upper 384 bits are set to zero.
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_zextph128_ph512(__m128h __a) {
- return __builtin_shufflevector(
- __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
- 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15);
-}
-
-/// Constructs a 512-bit floating-point vector of [32 x half] from a
-/// 256-bit floating-point vector of [16 x half]. The lower 256 bits
-/// contain the value of the source vector. The upper 256 bits are set
-/// to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit vector of [16 x half].
-/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits
-/// contain the value of the parameter. The upper 256 bits are set to zero.
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_zextph256_ph512(__m256h __a) {
- return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3,
- 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
- 29, 30, 31);
-}
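/*
 * Sketch (hypothetical helper): unlike the cast intrinsics above, whose upper
 * lanes are unspecified, the zext forms guarantee zeroed upper lanes, which
 * matters whenever those lanes are observed later.
 */
static __m512h widen_half_vector(__m128h x) {
  return _mm512_zextph128_ph512(x);
}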
-
-#define _mm_comi_round_sh(A, B, P, R) \
- __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R))
-
-#define _mm_comi_sh(A, B, pred) \
- _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION)
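/*
 * Sketch (hypothetical helper): an ordered less-than compare of the scalar
 * (element 0) half-precision values; the result is 0 or 1.
 */
static int half_lt(__m128h a, __m128h b) {
  return _mm_comi_sh(a, b, _CMP_LT_OS);
}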
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A,
- __m128h B) {
- return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A,
- __m128h B) {
- return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A,
- __m128h B) {
- return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A,
- __m128h B) {
- return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A,
- __m128h B) {
- return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A,
- __m128h B) {
- return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A,
- __m128h B) {
- return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A,
- __m128h B) {
- return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A,
- __m128h B) {
- return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A,
- __m128h B) {
- return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A,
- __m128h B) {
- return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A,
- __m128h B) {
- return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A,
- __m512h __B) {
- return (__m512h)((__v32hf)__A + (__v32hf)__B);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_selectph_512(
- (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
- (__v32hf)_mm512_add_ph(__A, __B),
- (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm512_add_round_ph(A, B, R) \
- ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \
- (__v32hf)(__m512h)(B), (int)(R)))
-
-#define _mm512_mask_add_round_ph(W, U, A, B, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
- (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_add_round_ph(U, A, B, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \
- (__v32hf)_mm512_setzero_ph()))
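/*
 * Sketch (hypothetical helper): the *_round_* forms take an explicit rounding
 * mode; a direction combined with _MM_FROUND_NO_EXC selects embedded rounding
 * with exceptions suppressed.
 */
static __m512h add_ph_rn(__m512h a, __m512h b) {
  return _mm512_add_round_ph(a, b,
                             _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}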
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A,
- __m512h __B) {
- return (__m512h)((__v32hf)__A - (__v32hf)__B);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_selectph_512(
- (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
- (__v32hf)_mm512_sub_ph(__A, __B),
- (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm512_sub_round_ph(A, B, R) \
- ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \
- (__v32hf)(__m512h)(B), (int)(R)))
-
-#define _mm512_mask_sub_round_ph(W, U, A, B, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
- (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_sub_round_ph(U, A, B, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \
- (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A,
- __m512h __B) {
- return (__m512h)((__v32hf)__A * (__v32hf)__B);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_selectph_512(
- (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
- (__v32hf)_mm512_mul_ph(__A, __B),
- (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm512_mul_round_ph(A, B, R) \
- ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \
- (__v32hf)(__m512h)(B), (int)(R)))
-
-#define _mm512_mask_mul_round_ph(W, U, A, B, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
- (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_mul_round_ph(U, A, B, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \
- (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A,
- __m512h __B) {
- return (__m512h)((__v32hf)__A / (__v32hf)__B);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_selectph_512(
- (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
- (__v32hf)_mm512_div_ph(__A, __B),
- (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm512_div_round_ph(A, B, R) \
- ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \
- (__v32hf)(__m512h)(B), (int)(R)))
-
-#define _mm512_mask_div_round_ph(W, U, A, B, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
- (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_div_round_ph(U, A, B, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \
- (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A,
- __m512h __B) {
- return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_selectph_512(
- (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
- (__v32hf)_mm512_min_ph(__A, __B),
- (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm512_min_round_ph(A, B, R) \
- ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \
- (__v32hf)(__m512h)(B), (int)(R)))
-
-#define _mm512_mask_min_round_ph(W, U, A, B, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
- (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_min_round_ph(U, A, B, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \
- (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A,
- __m512h __B) {
- return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_selectph_512(
- (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U,
- (__v32hf)_mm512_max_ph(__A, __B),
- (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm512_max_round_ph(A, B, R) \
- ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \
- (__v32hf)(__m512h)(B), (int)(R)))
-
-#define _mm512_mask_max_round_ph(W, U, A, B, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
- (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_max_round_ph(U, A, B, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \
- (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) {
- return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) {
- return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f));
-}
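/*
 * Sketch (hypothetical helper): the _pch intrinsics treat each 32-bit lane as
 * one complex half-precision number (real part in the low 16 bits, imaginary
 * part in the high 16 bits); the XOR with -0.0f above therefore flips only
 * the imaginary sign bit.
 */
static __m512h conjugate_demo(__m512h z) {
  return _mm512_conj_pch(z);
}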
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) {
- return (__m512h)__builtin_ia32_selectps_512(
- (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) {
- return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U,
- (__v16sf)_mm512_conj_pch(__A),
- (__v16sf)_mm512_setzero_ps());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A,
- __m128h __B) {
- __A[0] += __B[0];
- return __A;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- __A = _mm_add_sh(__A, __B);
- return __builtin_ia32_selectsh_128(__U, __A, __W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- __A = _mm_add_sh(__A, __B);
- return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
-}
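/*
 * Sketch (hypothetical helper): for the scalar _sh forms, only element 0 is
 * computed; elements 1..7 of the result are taken from __A.  With the masked
 * variant, element 0 falls back to __W when mask bit 0 is clear.
 */
static __m128h add_low_or_keep(__m128h w, __mmask8 m, __m128h a, __m128h b) {
  return _mm_mask_add_sh(w, m, a, b);
}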
-
-#define _mm_add_round_sh(A, B, R) \
- ((__m128h)__builtin_ia32_addsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_add_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_addsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_add_round_sh(U, A, B, R) \
- ((__m128h)__builtin_ia32_addsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A,
- __m128h __B) {
- __A[0] -= __B[0];
- return __A;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- __A = _mm_sub_sh(__A, __B);
- return __builtin_ia32_selectsh_128(__U, __A, __W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- __A = _mm_sub_sh(__A, __B);
- return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
-}
-
-#define _mm_sub_round_sh(A, B, R) \
- ((__m128h)__builtin_ia32_subsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_sub_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_subsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_sub_round_sh(U, A, B, R) \
- ((__m128h)__builtin_ia32_subsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A,
- __m128h __B) {
- __A[0] *= __B[0];
- return __A;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- __A = _mm_mul_sh(__A, __B);
- return __builtin_ia32_selectsh_128(__U, __A, __W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- __A = _mm_mul_sh(__A, __B);
- return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
-}
-
-#define _mm_mul_round_sh(A, B, R) \
- ((__m128h)__builtin_ia32_mulsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_mul_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_mulsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_mul_round_sh(U, A, B, R) \
- ((__m128h)__builtin_ia32_mulsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A,
- __m128h __B) {
- __A[0] /= __B[0];
- return __A;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- __A = _mm_div_sh(__A, __B);
- return __builtin_ia32_selectsh_128(__U, __A, __W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- __A = _mm_div_sh(__A, __B);
- return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph());
-}
-
-#define _mm_div_round_sh(A, B, R) \
- ((__m128h)__builtin_ia32_divsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_div_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_divsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_div_round_sh(U, A, B, R) \
- ((__m128h)__builtin_ia32_divsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_minsh_round_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B,
- (__v8hf)__W, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_minsh_round_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_min_round_sh(A, B, R) \
- ((__m128h)__builtin_ia32_minsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_min_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_minsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_min_round_sh(U, A, B, R) \
- ((__m128h)__builtin_ia32_minsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_maxsh_round_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B,
- (__v8hf)__W, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_maxsh_round_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_max_round_sh(A, B, R) \
- ((__m128h)__builtin_ia32_maxsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_max_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_maxsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_max_round_sh(U, A, B, R) \
- ((__m128h)__builtin_ia32_maxsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_cmp_round_ph_mask(A, B, P, R) \
- ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
- (__v32hf)(__m512h)(B), (int)(P), \
- (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \
- ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \
- (__v32hf)(__m512h)(B), (int)(P), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_cmp_ph_mask(A, B, P) \
- _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm512_mask_cmp_ph_mask(U, A, B, P) \
- _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)
-
-#define _mm_cmp_round_sh_mask(X, Y, P, R) \
- ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
- (__v8hf)(__m128h)(Y), (int)(P), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \
- ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \
- (__v8hf)(__m128h)(Y), (int)(P), \
- (__mmask8)(M), (int)(R)))
-
-#define _mm_cmp_sh_mask(X, Y, P) \
- ((__mmask8)__builtin_ia32_cmpsh_mask( \
- (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_cmp_sh_mask(M, X, Y, P) \
- ((__mmask8)__builtin_ia32_cmpsh_mask( \
- (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \
- _MM_FROUND_CUR_DIRECTION))
-// loads with vmovsh:
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
- struct __mm_load_sh_struct {
- _Float16 __u;
- } __attribute__((__packed__, __may_alias__));
- _Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u;
- return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
- __m128h src = (__v8hf)__builtin_shufflevector(
- (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
-
- return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
- return (__m128h)__builtin_ia32_loadsh128_mask(
- (__v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_load_ph(void const *__p) {
- return *(const __m512h *)__p;
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_load_ph(void const *__p) {
- return *(const __m256h *)__p;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) {
- return *(const __m128h *)__p;
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_loadu_ph(void const *__p) {
- struct __loadu_ph {
- __m512h_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_ph *)__p)->__v;
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_loadu_ph(void const *__p) {
- struct __loadu_ph {
- __m256h_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_ph *)__p)->__v;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) {
- struct __loadu_ph {
- __m128h_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_ph *)__p)->__v;
-}
-
-// stores with vmovsh:
-static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp,
- __m128h __a) {
- struct __mm_store_sh_struct {
- _Float16 __u;
- } __attribute__((__packed__, __may_alias__));
- ((struct __mm_store_sh_struct *)__dp)->__u = __a[0];
-}
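/*
 * Sketch (hypothetical helper): round-trip a single _Float16 through the
 * scalar load and store forms; the remaining seven lanes of the temporary are
 * zeroed by _mm_load_sh.
 */
static _Float16 copy_half(const _Float16 *src, _Float16 *dst) {
  __m128h v = _mm_load_sh(src);
  _mm_store_sh(dst, v);
  return v[0];
}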
-
-static __inline__ void __DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W,
- __mmask8 __U,
- __m128h __A) {
- __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P,
- __m512h __A) {
- *(__m512h *)__P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P,
- __m256h __A) {
- *(__m256h *)__P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P,
- __m128h __A) {
- *(__m128h *)__P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P,
- __m512h __A) {
- struct __storeu_ph {
- __m512h_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_ph *)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P,
- __m256h __A) {
- struct __storeu_ph {
- __m256h_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_ph *)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P,
- __m128h __A) {
- struct __storeu_ph {
- __m128h_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_ph *)__P)->__v = __A;
-}
-
-// moves with vmovsh:
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a,
- __m128h __b) {
- __a[0] = __b[0];
- return __a;
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B),
- _mm_setzero_ph());
-}
-
-// vmovw:
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) {
- return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0};
-}
-
-static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) {
- __v8hi __b = (__v8hi)__a;
- return __b[0];
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) {
- return (__m512h)__builtin_ia32_rcpph512_mask(
- (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
- return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W,
- (__mmask32)__U);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) {
- return (__m512h)__builtin_ia32_rcpph512_mask(
- (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) {
- return (__m512h)__builtin_ia32_rsqrtph512_mask(
- (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
- return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W,
- (__mmask32)__U);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) {
- return (__m512h)__builtin_ia32_rsqrtph512_mask(
- (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U);
-}
-
-#define _mm512_getmant_ph(A, B, C) \
- ((__m512h)__builtin_ia32_getmantph512_mask( \
- (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
- (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_getmant_ph(W, U, A, B, C) \
- ((__m512h)__builtin_ia32_getmantph512_mask( \
- (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
- (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_getmant_ph(U, A, B, C) \
- ((__m512h)__builtin_ia32_getmantph512_mask( \
- (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
- (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_getmant_round_ph(A, B, C, R) \
- ((__m512h)__builtin_ia32_getmantph512_mask( \
- (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
- (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
- ((__m512h)__builtin_ia32_getmantph512_mask( \
- (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
- ((__m512h)__builtin_ia32_getmantph512_mask( \
- (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
- (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
- return (__m512h)__builtin_ia32_getexpph512_mask(
- (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
- return (__m512h)__builtin_ia32_getexpph512_mask(
- (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
- return (__m512h)__builtin_ia32_getexpph512_mask(
- (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_getexp_round_ph(A, R) \
- ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
- (__v32hf)_mm512_undefined_ph(), \
- (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_getexp_round_ph(W, U, A, R) \
- ((__m512h)__builtin_ia32_getexpph512_mask( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_getexp_round_ph(U, A, R) \
- ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
- (__v32hf)_mm512_setzero_ph(), \
- (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
- __m512h __B) {
- return (__m512h)__builtin_ia32_scalefph512_mask(
- (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
- (__v32hf)__W, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_scalefph512_mask(
- (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_scalef_round_ph(A, B, R) \
- ((__m512h)__builtin_ia32_scalefph512_mask( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
- (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
- ((__m512h)__builtin_ia32_scalefph512_mask( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
- ((__m512h)__builtin_ia32_scalefph512_mask( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
- (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
-
-#define _mm512_roundscale_ph(A, B) \
- ((__m512h)__builtin_ia32_rndscaleph_mask( \
- (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
- _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_ph(A, B, C, imm) \
- ((__m512h)__builtin_ia32_rndscaleph_mask( \
- (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
- (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_roundscale_ph(A, B, imm) \
- ((__m512h)__builtin_ia32_rndscaleph_mask( \
- (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
- (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
- ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
- (__v32hf)(__m512h)(A), \
- (__mmask32)(B), (int)(R)))
-
-#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
- ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
- (__v32hf)_mm512_setzero_ph(), \
- (__mmask32)(A), (int)(R)))
-
-#define _mm512_roundscale_round_ph(A, imm, R) \
- ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
- (__v32hf)_mm512_undefined_ph(), \
- (__mmask32)-1, (int)(R)))
-
-#define _mm512_reduce_ph(A, imm) \
- ((__m512h)__builtin_ia32_reduceph512_mask( \
- (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
- (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_reduce_ph(W, U, A, imm) \
- ((__m512h)__builtin_ia32_reduceph512_mask( \
- (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
- (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_reduce_ph(U, A, imm) \
- ((__m512h)__builtin_ia32_reduceph512_mask( \
- (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
- (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
- ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
- (__v32hf)(__m512h)(W), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
- ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
- (__v32hf)_mm512_setzero_ph(), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_reduce_round_ph(A, imm, R) \
- ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
- (__v32hf)_mm512_undefined_ph(), \
- (__mmask32)-1, (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_rcpsh_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
- (__v8hf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_rcpsh_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_rsqrtsh_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B,
- (__v8hf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_rsqrtsh_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-#define _mm_getmant_round_sh(A, B, C, D, R) \
- ((__m128h)__builtin_ia32_getmantsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
- (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R)))
-
-#define _mm_getmant_sh(A, B, C, D) \
- ((__m128h)__builtin_ia32_getmantsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
- (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_sh(W, U, A, B, C, D) \
- ((__m128h)__builtin_ia32_getmantsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
- (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \
- ((__m128h)__builtin_ia32_getmantsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
- (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_getmant_sh(U, A, B, C, D) \
- ((__m128h)__builtin_ia32_getmantsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
- (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \
- ((__m128h)__builtin_ia32_getmantsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \
- (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
-
-#define _mm_getexp_round_sh(A, B, R) \
- ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_getexpsh128_round_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_getexpsh128_round_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_getexp_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_getexpsh128_round_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_getexp_round_sh(U, A, B, R) \
- ((__m128h)__builtin_ia32_getexpsh128_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_scalef_round_sh(A, B, R) \
- ((__m128h)__builtin_ia32_scalefsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_scalefsh_round_mask(
- (__v8hf)__A, (__v8hf)(__B), (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B,
- (__v8hf)__W, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask_scalef_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_scalefsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_scalefsh_round_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_scalef_round_sh(U, A, B, R) \
- ((__m128h)__builtin_ia32_scalefsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_roundscale_round_sh(A, B, imm, R) \
- ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(imm), (int)(R)))
-
-#define _mm_roundscale_sh(A, B, imm) \
- ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_roundscale_sh(W, U, A, B, I) \
- ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \
- ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(I), (int)(R)))
-
-#define _mm_maskz_roundscale_sh(U, A, B, I) \
- ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \
- ((__m128h)__builtin_ia32_rndscalesh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(I), (int)(R)))
-
-#define _mm_reduce_sh(A, B, C) \
- ((__m128h)__builtin_ia32_reducesh_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_mask_reduce_sh(W, U, A, B, C) \
- ((__m128h)__builtin_ia32_reducesh_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_maskz_reduce_sh(U, A, B, C) \
- ((__m128h)__builtin_ia32_reducesh_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm_reduce_round_sh(A, B, C, R) \
- ((__m128h)__builtin_ia32_reducesh_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(C), (int)(R)))
-
-#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \
- ((__m128h)__builtin_ia32_reducesh_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(C), (int)(R)))
-
-#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \
- ((__m128h)__builtin_ia32_reducesh_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(C), (int)(R)))
-
-#define _mm512_sqrt_round_ph(A, R) \
- ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R)))
-
-#define _mm512_mask_sqrt_round_ph(W, U, A, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
- (__v32hf)(__m512h)(W)))
-
-#define _mm512_maskz_sqrt_round_ph(U, A, R) \
- ((__m512h)__builtin_ia32_selectph_512( \
- (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \
- (__v32hf)_mm512_setzero_ph()))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) {
- return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) {
- return (__m512h)__builtin_ia32_selectph_512(
- (__mmask32)(__U),
- (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
- (__v32hf)(__m512h)(__W));
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) {
- return (__m512h)__builtin_ia32_selectph_512(
- (__mmask32)(__U),
- (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)),
- (__v32hf)_mm512_setzero_ph());
-}
-
-#define _mm_sqrt_round_sh(A, B, R) \
- ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_sqrt_round_sh(U, A, B, R) \
- ((__m128h)__builtin_ia32_sqrtsh_round_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_sqrtsh_round_mask(
- (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
- (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W,
- __mmask32 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_sqrtsh_round_mask(
- (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W),
- (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_sqrtsh_round_mask(
- (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(),
- (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fpclass_ph_mask(U, A, imm) \
- ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
- (int)(imm), (__mmask32)(U)))
-
-#define _mm512_fpclass_ph_mask(A, imm) \
- ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \
- (int)(imm), (__mmask32)-1))
-
-#define _mm_fpclass_sh_mask(A, imm) \
- ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm_mask_fpclass_sh_mask(U, A, imm) \
- ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm512_cvt_roundpd_ph(A, R) \
- ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
- (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \
- ((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \
- ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \
- (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) {
- return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
- (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) {
- return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
- (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) {
- return (__m128h)__builtin_ia32_vcvtpd2ph512_mask(
- (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_pd(A, R) \
- ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
- (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \
- ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_pd(U, A, R) \
- ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \
- (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) {
- return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
- (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) {
- return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
- (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512d __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
- return (__m512d)__builtin_ia32_vcvtph2pd512_mask(
- (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundsh_ss(A, B, R) \
- ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
- (__v4sf)_mm_undefined_ps(), \
- (__mmask8)(-1), (int)(R)))
-
-#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \
- ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \
- (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \
- ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A,
- __m128h __B) {
- return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
- (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W,
- __mmask8 __U,
- __m128 __A,
- __m128h __B) {
- return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B,
- (__v4sf)__W, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U,
- __m128 __A,
- __m128h __B) {
- return (__m128)__builtin_ia32_vcvtsh2ss_round_mask(
- (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundss_sh(A, B, R) \
- ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
- (__v8hf)_mm_undefined_ph(), \
- (__mmask8)(-1), (int)(R)))
-
-#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \
- (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \
- ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \
- (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A,
- __m128 __B) {
- return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
- (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128 __B) {
- return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
- (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U,
- __m128h __A,
- __m128 __B) {
- return (__m128h)__builtin_ia32_vcvtss2sh_round_mask(
- (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundsd_sh(A, B, R) \
- ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
- (__v8hf)_mm_undefined_ph(), \
- (__mmask8)(-1), (int)(R)))
-
-#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \
- (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \
- ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \
- (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A,
- __m128d __B) {
- return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
- (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128d __B) {
- return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
- (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) {
- return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask(
- (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundsh_sd(A, B, R) \
- ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
- (__v2df)_mm_undefined_pd(), \
- (__mmask8)(-1), (int)(R)))
-
-#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \
- ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \
- (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \
- ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A,
- __m128h __B) {
- return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
- (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W,
- __mmask8 __U,
- __m128d __A,
- __m128h __B) {
- return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
- (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) {
- return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask(
- (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_epi16(A, R) \
- ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
- (__v32hi)_mm512_undefined_epi32(), \
- (__mmask32)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \
- ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \
- ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \
- (__v32hi)_mm512_setzero_epi32(), \
- (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtph_epi16(__m512h __A) {
- return (__m512i)__builtin_ia32_vcvtph2w512_mask(
- (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
- return (__m512i)__builtin_ia32_vcvtph2w512_mask(
- (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) {
- return (__m512i)__builtin_ia32_vcvtph2w512_mask(
- (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundph_epi16(A, R) \
- ((__m512i)__builtin_ia32_vcvttph2w512_mask( \
- (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \
- (int)(R)))
-
-#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \
- ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \
- ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \
- (__v32hi)_mm512_setzero_epi32(), \
- (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttph_epi16(__m512h __A) {
- return (__m512i)__builtin_ia32_vcvttph2w512_mask(
- (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) {
- return (__m512i)__builtin_ia32_vcvttph2w512_mask(
- (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) {
- return (__m512i)__builtin_ia32_vcvttph2w512_mask(
- (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepi16_ph(A, R) \
- ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \
- (__v32hf)_mm512_undefined_ph(), \
- (__mmask32)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \
- ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \
- ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \
- (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_cvtepi16_ph(__m512i __A) {
- return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
- (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
- return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
- (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) {
- return (__m512h)__builtin_ia32_vcvtw2ph512_mask(
- (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_epu16(A, R) \
- ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \
- (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
- (int)(R)))
-
-#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \
- ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \
- ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \
- (__v32hu)_mm512_setzero_epi32(), \
- (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtph_epu16(__m512h __A) {
- return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
- (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
- return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
- (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) {
- return (__m512i)__builtin_ia32_vcvtph2uw512_mask(
- (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundph_epu16(A, R) \
- ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \
- (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \
- (int)(R)))
-
-#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \
- ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \
- ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \
- (__v32hu)_mm512_setzero_epi32(), \
- (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttph_epu16(__m512h __A) {
- return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
- (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) {
- return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
- (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) {
- return (__m512i)__builtin_ia32_vcvttph2uw512_mask(
- (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepu16_ph(A, R) \
- ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \
- (__v32hf)_mm512_undefined_ph(), \
- (__mmask32)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \
- ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \
- ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \
- (__v32hu)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_cvtepu16_ph(__m512i __A) {
- return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
- (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) {
- return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
- (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) {
- return (__m512h)__builtin_ia32_vcvtuw2ph512_mask(
- (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_epi32(A, R) \
- ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \
- (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
- (int)(R)))
-
-#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \
- ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \
- ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \
- (__v16si)_mm512_setzero_epi32(), \
- (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtph_epi32(__m256h __A) {
- return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
- (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
- return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
- (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) {
- return (__m512i)__builtin_ia32_vcvtph2dq512_mask(
- (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_epu32(A, R) \
- ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \
- (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
- (int)(R)))
-
-#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \
- ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \
- ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \
- (__v16su)_mm512_setzero_epi32(), \
- (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtph_epu32(__m256h __A) {
- return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
- (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
- return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
- (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) {
- return (__m512i)__builtin_ia32_vcvtph2udq512_mask(
- (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepi32_ph(A, R) \
- ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \
- (__v16hf)_mm256_undefined_ph(), \
- (__mmask16)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \
- ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \
- ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \
- (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_cvtepi32_ph(__m512i __A) {
- return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
- (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
- return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
- (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) {
- return (__m256h)__builtin_ia32_vcvtdq2ph512_mask(
- (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepu32_ph(A, R) \
- ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \
- (__v16hf)_mm256_undefined_ph(), \
- (__mmask16)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \
- ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \
- ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \
- (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_cvtepu32_ph(__m512i __A) {
- return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
- (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) {
- return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
- (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) {
- return (__m256h)__builtin_ia32_vcvtudq2ph512_mask(
- (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundph_epi32(A, R) \
- ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \
- (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \
- (int)(R)))
-
-#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \
- ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \
- ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \
- (__v16si)_mm512_setzero_epi32(), \
- (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttph_epi32(__m256h __A) {
- return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
- (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) {
- return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
- (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) {
- return (__m512i)__builtin_ia32_vcvttph2dq512_mask(
- (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundph_epu32(A, R) \
- ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
- (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \
- (int)(R)))
-
-#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \
- ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \
- ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \
- (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \
- (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttph_epu32(__m256h __A) {
- return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
- (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) {
- return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
- (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) {
- return (__m512i)__builtin_ia32_vcvttph2udq512_mask(
- (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepi64_ph(A, R) \
- ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
- (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \
- ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \
- ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \
- (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_cvtepi64_ph(__m512i __A) {
- return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
- (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
- return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
- (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) {
- return (__m128h)__builtin_ia32_vcvtqq2ph512_mask(
- (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_epi64(A, R) \
- ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \
- (__v8di)_mm512_undefined_epi32(), \
- (__mmask8)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \
- ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \
- ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \
- (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtph_epi64(__m128h __A) {
- return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
- (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
- return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
- (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
- return (__m512i)__builtin_ia32_vcvtph2qq512_mask(
- (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundepu64_ph(A, R) \
- ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
- (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R)))
-
-#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \
- ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \
- ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \
- (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_cvtepu64_ph(__m512i __A) {
- return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
- (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) {
- return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
- (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) {
- return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask(
- (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvt_roundph_epu64(A, R) \
- ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
- (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
- (int)(R)))
-
-#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \
- ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \
- ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \
- (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtph_epu64(__m128h __A) {
- return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
- (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
- return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
- (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
- return (__m512i)__builtin_ia32_vcvtph2uqq512_mask(
- (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundph_epi64(A, R) \
- ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
- (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \
- (int)(R)))
-
-#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \
- ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \
- ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \
- (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttph_epi64(__m128h __A) {
- return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
- (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) {
- return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
- (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
- return (__m512i)__builtin_ia32_vcvttph2qq512_mask(
- (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtt_roundph_epu64(A, R) \
- ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
- (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \
- (int)(R)))
-
-#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \
- ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \
- ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \
- (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvttph_epu64(__m128h __A) {
- return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
- (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) {
- return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
- (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
- return (__m512i)__builtin_ia32_vcvttph2uqq512_mask(
- (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundsh_i32(A, R) \
- ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R)))
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) {
- return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundsh_u32(A, R) \
- ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R)))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS128
-_mm_cvtsh_u32(__m128h __A) {
- return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundsh_i64(A, R) \
- ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R)))
-
-static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) {
- return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_cvt_roundsh_u64(A, R) \
- ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
-_mm_cvtsh_u64(__m128h __A) {
- return (unsigned long long)__builtin_ia32_vcvtsh2usi64(
- (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
-}
-#endif // __x86_64__
-
-#define _mm_cvt_roundu32_sh(A, B, R) \
- ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_cvtu32_sh(__m128h __A, unsigned int __B) {
- __A[0] = __B;
- return __A;
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundu64_sh(A, B, R) \
- ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \
- (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_cvtu64_sh(__m128h __A, unsigned long long __B) {
- __A[0] = __B;
- return __A;
-}
-#endif
-
-#define _mm_cvt_roundi32_sh(A, B, R) \
- ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A,
- int __B) {
- __A[0] = __B;
- return __A;
-}
-
-#ifdef __x86_64__
-#define _mm_cvt_roundi64_sh(A, B, R) \
- ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A,
- long long __B) {
- __A[0] = __B;
- return __A;
-}
-#endif
-
-#define _mm_cvtt_roundsh_i32(A, R) \
- ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R)))
-
-static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) {
- return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvtt_roundsh_i64(A, R) \
- ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R)))
-
-static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) {
- return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A,
- _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm_cvtt_roundsh_u32(A, R) \
- ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R)))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS128
-_mm_cvttsh_u32(__m128h __A) {
- return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#ifdef __x86_64__
-#define _mm_cvtt_roundsh_u64(A, R) \
- ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS128
-_mm_cvttsh_u64(__m128h __A) {
- return (unsigned long long)__builtin_ia32_vcvttsh2usi64(
- (__v8hf)__A, _MM_FROUND_CUR_DIRECTION);
-}
-#endif
-
-#define _mm512_cvtx_roundph_ps(A, R) \
- ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \
- (__v16sf)_mm512_undefined_ps(), \
- (__mmask16)(-1), (int)(R)))
-
-#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \
- ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \
- ((__m512)__builtin_ia32_vcvtph2psx512_mask( \
- (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) {
- return (__m512)__builtin_ia32_vcvtph2psx512_mask(
- (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) {
- return (__m512)__builtin_ia32_vcvtph2psx512_mask(
- (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512 __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) {
- return (__m512)__builtin_ia32_vcvtph2psx512_mask(
- (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_cvtx_roundps_ph(A, R) \
- ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \
- (__v16hf)_mm256_undefined_ph(), \
- (__mmask16)(-1), (int)(R)))
-
-#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \
- ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \
- ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \
- (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R)))
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) {
- return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
- (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) {
- return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
- (__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) {
- return (__m256h)__builtin_ia32_vcvtps2phx512_mask(
- (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmadd_round_ph(A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_mask( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_mask( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_fmsub_round_ph(A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_mask( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
- (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_mask( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_fnmadd_round_ph(A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_mask( \
- (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_mask3( \
- -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
- -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_fnmsub_round_ph(A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_mask( \
- (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
- (__mmask32)-1, (int)(R)))
-
-#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_maskz( \
- -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A,
- __m512h __B,
- __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
- (__v32hf)__C, (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
- (__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
- return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B,
- (__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B,
- (__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A,
- __m512h __B,
- __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
- -(__v32hf)__C, (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B,
- -(__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddph512_maskz(
- (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A,
- __m512h __B,
- __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
- (__v32hf)__C, (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
- return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B,
- (__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B,
- (__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A,
- __m512h __B,
- __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
- -(__v32hf)__C, (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddph512_maskz(
- -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmaddsub_round_ph(A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
- ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_fmsubadd_round_ph(A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
- (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
- (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
- (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
- return (__m512h)__builtin_ia32_vfmaddsubph512_mask3(
- (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
- (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
- (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddsubph512_mask(
- (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddsubph512_maskz(
- (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
- ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
- return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B,
- (__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
- ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \
- (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
- return (__m512h)__builtin_ia32_vfmsubaddph512_mask3(
- (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_mask( \
- (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
- (__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddph512_mask( \
- (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
- ((__m512h)__builtin_ia32_vfmsubph512_mask3( \
- -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \
- (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B,
- -(__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) {
- return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B,
- (__v32hf)__C, (__mmask32)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W,
- __m128h __A,
- __m128h __B) {
- return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
- (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B,
- (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmadd_round_sh(A, B, C, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
- (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
- return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
- (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
- -(__v8hf)__B, (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A,
- -(__v8hf)__B, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmsub_round_sh(A, B, C, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
- (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B,
- -(__v8hf)__C, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
- (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
- return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \
- ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
- (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W,
- __m128h __A,
- __m128h __B) {
- return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
- (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
- return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B,
- (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fnmadd_round_sh(A, B, C, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
- (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
- (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
- (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
- return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \
- (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W,
- __m128h __A,
- __m128h __B) {
- return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
- (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
- return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B,
- (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fnmsub_round_sh(A, B, C, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
- (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_mask( \
- (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
- ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \
- (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) {
- return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y,
- (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \
- ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \
- (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A,
- __m128h __B,
- __m128h __C) {
- return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__C, (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask(
- (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__C, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
- return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3(
- (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fcmadd_round_sch(A, B, C, R) \
- ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \
- ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \
- ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \
- ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A,
- __m128h __B,
- __m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__C, (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddcsh_round_mask(
- (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__C, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
- return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3(
- (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmadd_round_sch(A, B, C, R) \
- ((__m128h)__builtin_ia32_vfmaddcsh_mask( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
- (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \
- ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \
- ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \
- ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \
- (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_vfcmulcsh_mask(
- (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__W, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_vfcmulcsh_mask(
- (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fcmul_round_sch(A, B, R) \
- ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
- (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_fcmul_round_sch(U, A, B, R) \
- ((__m128h)__builtin_ia32_vfcmulcsh_mask( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
- (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_vfmulcsh_mask(
- (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__W, (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_vfmulcsh_mask(
- (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm_fmul_round_sch(A, B, R) \
- ((__m128h)__builtin_ia32_vfmulcsh_mask( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
- (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R)))
-
-#define _mm_mask_fmul_round_sch(W, U, A, B, R) \
- ((__m128h)__builtin_ia32_vfmulcsh_mask( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \
- (__mmask8)(U), (int)(R)))
-
-#define _mm_maskz_fmul_round_sch(U, A, B, R) \
- ((__m128h)__builtin_ia32_vfmulcsh_mask( \
- (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \
- (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A,
- __m512h __B) {
- return (__m512h)__builtin_ia32_vfcmulcph512_mask(
- (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B,
- (__v16sf)__W, (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_vfcmulcph512_mask(
- (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fcmul_round_pch(A, B, R) \
- ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
- (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \
- ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \
- ((__m512h)__builtin_ia32_vfcmulcph512_mask( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
- (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A,
- __m512h __B) {
- return (__m512h)__builtin_ia32_vfmulcph512_mask(
- (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B,
- (__v16sf)__W, (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) {
- return (__m512h)__builtin_ia32_vfmulcph512_mask(
- (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmul_round_pch(A, B, R) \
- ((__m512h)__builtin_ia32_vfmulcph512_mask( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
- (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \
- ((__m512h)__builtin_ia32_vfmulcph512_mask( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_fmul_round_pch(U, A, B, R) \
- ((__m512h)__builtin_ia32_vfmulcph512_mask( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \
- (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A,
- __m512h __B,
- __m512h __C) {
- return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
- (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfcmaddcph512_mask(
- (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
- return (__m512h)__builtin_ia32_vfcmaddcph512_mask3(
- (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfcmaddcph512_maskz(
- (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fcmadd_round_pch(A, B, C, R) \
- ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \
- ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \
- ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \
- ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
- (__mmask16)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A,
- __m512h __B,
- __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B,
- (__v16sf)__C, (__mmask16)-1,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B,
- (__v16sf)__C, (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) {
- return (__m512h)__builtin_ia32_vfmaddcph512_mask3(
- (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) {
- return (__m512h)__builtin_ia32_vfmaddcph512_maskz(
- (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U,
- _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_fmadd_round_pch(A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
- (__mmask16)-1, (int)(R)))
-
-#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddcph512_mask( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \
- ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
- (__mmask16)(U), (int)(R)))
-
-#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \
- ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \
- (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \
- (__mmask16)(U), (int)(R)))
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS512
-_mm512_reduce_add_ph(__m512h __W) {
- return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS512
-_mm512_reduce_mul_ph(__m512h __W) {
- return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS512
-_mm512_reduce_max_ph(__m512h __V) {
- return __builtin_ia32_reduce_fmax_ph512(__V);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS512
-_mm512_reduce_min_ph(__m512h __V) {
- return __builtin_ia32_reduce_fmin_ph512(__V);
-}
-
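/* A minimal usage sketch, assuming a compiler targeting AVX512-FP16
 * (-mavx512fp16); the helper name is illustrative. The reductions above
 * fold all 32 half-precision lanes of a __m512h into a single scalar. */
#include <immintrin.h>

_Float16 sum_of_32_halves(const _Float16 *p) {
  __m512h v = _mm512_loadu_ph(p);  /* unaligned load of 32 x FP16 */
  return _mm512_reduce_add_ph(v);  /* horizontal add across all lanes */
}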
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) {
- return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W,
- (__v32hf)__A);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) {
- return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I,
- (__v32hi)__B);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_permutexvar_ph(__m512i __A, __m512h __B) {
- return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A);
-}
-
-// intrinsics below are aliases for f*mul_*ch
-#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B)
-#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B)
-#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B)
-#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R)
-#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
- _mm512_mask_fmul_round_pch(W, U, A, B, R)
-#define _mm512_maskz_mul_round_pch(U, A, B, R) \
- _mm512_maskz_fmul_round_pch(U, A, B, R)
-
-#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B)
-#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B)
-#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B)
-#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R)
-#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
- _mm512_mask_fcmul_round_pch(W, U, A, B, R)
-#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
- _mm512_maskz_fcmul_round_pch(U, A, B, R)
-
-#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B)
-#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B)
-#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B)
-#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R)
-#define _mm_mask_mul_round_sch(W, U, A, B, R) \
- _mm_mask_fmul_round_sch(W, U, A, B, R)
-#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R)
-
-#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B)
-#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B)
-#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B)
-#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R)
-#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
- _mm_mask_fcmul_round_sch(W, U, A, B, R)
-#define _mm_maskz_cmul_round_sch(U, A, B, R) \
- _mm_maskz_fcmul_round_sch(U, A, B, R)
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-#undef __DEFAULT_FN_ATTRS512
-
-#endif
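/* A minimal usage sketch, assuming -mavx512fp16; the helper name is
 * illustrative. The *_pch intrinsics above treat a __m512h as 16 complex
 * values stored as interleaved (real, imag) FP16 pairs. */
#include <immintrin.h>

__m512h cmul_accumulate(__m512h a, __m512h b, __m512h acc) {
  /* Complex multiply-add: per complex lane, acc + a*b.
   * _mm512_mul_pch(a, b) is the aliased plain complex multiply. */
  return _mm512_fmadd_pch(a, b, acc);
}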
+++ /dev/null
-/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __IFMAINTRIN_H
-#define __IFMAINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), __min_vector_width__(512)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
-{
- return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y,
- (__v8di) __Z);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512(__M,
- (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
-{
- return (__m512i)__builtin_ia32_selectq_512(__M,
- (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
-{
- return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y,
- (__v8di) __Z);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectq_512(__M,
- (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y),
- (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
-{
- return (__m512i)__builtin_ia32_selectq_512(__M,
- (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z),
- (__v8di)_mm512_setzero_si512());
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
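/* A minimal usage sketch, assuming -mavx512ifma; the helper name is
 * illustrative. Each 64-bit lane is treated as an unsigned value of at most
 * 52 bits, and either the low or the high 52 bits of the 104-bit product are
 * accumulated, the core step of vectorized big-integer multiplication. */
#include <immintrin.h>

void madd52_step(__m512i *lo, __m512i *hi, __m512i a, __m512i b) {
  *lo = _mm512_madd52lo_epu64(*lo, a, b); /* lo += low 52 bits of a*b  */
  *hi = _mm512_madd52hi_epu64(*hi, a, b); /* hi += high 52 bits of a*b */
}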
+++ /dev/null
-/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __IFMAVLINTRIN_H
-#define __IFMAVLINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))
-
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
-{
- return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y,
- (__v2di) __Z);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectq_128(__M,
- (__v2di)_mm_madd52hi_epu64(__W, __X, __Y),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
-{
- return (__m128i)__builtin_ia32_selectq_128(__M,
- (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
-{
- return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
- (__v4di)__Z);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256(__M,
- (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
-{
- return (__m256i)__builtin_ia32_selectq_256(__M,
- (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
-{
- return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
- (__v2di)__Z);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectq_128(__M,
- (__v2di)_mm_madd52lo_epu64(__W, __X, __Y),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
-{
- return (__m128i)__builtin_ia32_selectq_128(__M,
- (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
-{
- return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
- (__v4di)__Z);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256(__M,
- (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
-{
- return (__m256i)__builtin_ia32_selectq_256(__M,
- (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z),
- (__v4di)_mm256_setzero_si256());
-}
-
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
+++ /dev/null
-/*===------------- avx512pfintrin.h - PF intrinsics ------------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512PFINTRIN_H
-#define __AVX512PFINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf")))
-
-#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \
- __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
- (void const *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \
- __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \
- (void const *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \
- __builtin_ia32_gatherpfdps((__mmask16)(mask), \
- (__v16si)(__m512i)(index), (void const *)(addr), \
- (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \
- __builtin_ia32_gatherpfdps((__mmask16) -1, \
- (__v16si)(__m512i)(index), (void const *)(addr), \
- (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \
- __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
- (void const *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \
- __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \
- (void const *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \
- __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
- (void const *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \
- __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \
- (void const *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \
- __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \
- (void *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \
- __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \
- (void *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \
- __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \
- (void *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \
- __builtin_ia32_scatterpfdps((__mmask16)(mask), \
- (__v16si)(__m512i)(index), (void *)(addr), \
- (int)(scale), (int)(hint))
-
-#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \
- __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \
- (void *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \
- __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \
- (void *)(addr), (int)(scale), \
- (int)(hint))
-
-#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \
- __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \
- (void *)(addr), (int)(scale), (int)(hint))
-
-#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \
- __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \
- (void *)(addr), (int)(scale), (int)(hint))
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
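/* A minimal usage sketch, assuming -mavx512pf (these prefetch macros are
 * specific to AVX-512PF, i.e. Xeon Phi parts); the helper name is
 * illustrative. The macros only warm the cache lines that a later masked
 * gather will touch. */
#include <immintrin.h>

void warm_gather_lines(const float *table, __m512i idx, __mmask16 m) {
  /* scale 4 = sizeof(float); _MM_HINT_T0 prefetches into all cache levels */
  _mm512_mask_prefetch_i32gather_ps(idx, m, table, 4, _MM_HINT_T0);
}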
+++ /dev/null
-/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VBMI2INTRIN_H
-#define __AVX512VBMI2INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"), __min_vector_width__(512)))
-
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D)
-{
- return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
- (__v32hi) __S,
- __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D)
-{
- return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D,
- (__v32hi) _mm512_setzero_si512(),
- __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D)
-{
- return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
- (__v64qi) __S,
- __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D)
-{
- return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D,
- (__v64qi) _mm512_setzero_si512(),
- __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D)
-{
- __builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D,
- __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D)
-{
- __builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D,
- __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D)
-{
- return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
- (__v32hi) __S,
- __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D)
-{
- return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D,
- (__v32hi) _mm512_setzero_si512(),
- __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D)
-{
- return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
- (__v64qi) __S,
- __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D)
-{
- return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D,
- (__v64qi) _mm512_setzero_si512(),
- __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
- (__v32hi) __S,
- __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P,
- (__v32hi) _mm512_setzero_si512(),
- __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
- (__v64qi) __S,
- __U);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P)
-{
- return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P,
- (__v64qi) _mm512_setzero_si512(),
- __U);
-}
-
-#define _mm512_shldi_epi64(A, B, I) \
- ((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_shldi_epi64(S, U, A, B, I) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_shldi_epi64((A), (B), (I)), \
- (__v8di)(__m512i)(S)))
-
-#define _mm512_maskz_shldi_epi64(U, A, B, I) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_shldi_epi64((A), (B), (I)), \
- (__v8di)_mm512_setzero_si512()))
-
-#define _mm512_shldi_epi32(A, B, I) \
- ((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_shldi_epi32(S, U, A, B, I) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_shldi_epi32((A), (B), (I)), \
- (__v16si)(__m512i)(S)))
-
-#define _mm512_maskz_shldi_epi32(U, A, B, I) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_shldi_epi32((A), (B), (I)), \
- (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_shldi_epi16(A, B, I) \
- ((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \
- (__v32hi)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_shldi_epi16(S, U, A, B, I) \
- ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
- (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
- (__v32hi)(__m512i)(S)))
-
-#define _mm512_maskz_shldi_epi16(U, A, B, I) \
- ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
- (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \
- (__v32hi)_mm512_setzero_si512()))
-
-#define _mm512_shrdi_epi64(A, B, I) \
- ((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
- (__v8di)(__m512i)(S)))
-
-#define _mm512_maskz_shrdi_epi64(U, A, B, I) \
- ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
- (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \
- (__v8di)_mm512_setzero_si512()))
-
-#define _mm512_shrdi_epi32(A, B, I) \
- ((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \
- (__v16si)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
- (__v16si)(__m512i)(S)))
-
-#define _mm512_maskz_shrdi_epi32(U, A, B, I) \
- ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
- (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \
- (__v16si)_mm512_setzero_si512()))
-
-#define _mm512_shrdi_epi16(A, B, I) \
- ((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \
- (__v32hi)(__m512i)(B), (int)(I)))
-
-#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \
- ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
- (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
- (__v32hi)(__m512i)(S)))
-
-#define _mm512_maskz_shrdi_epi16(U, A, B, I) \
- ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \
- (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \
- (__v32hi)_mm512_setzero_si512()))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B,
- (__v8di)__C);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_selectq_512(__U,
- (__v8di)_mm512_shldv_epi64(__A, __B, __C),
- (__v8di)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_selectq_512(__U,
- (__v8di)_mm512_shldv_epi64(__A, __B, __C),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B,
- (__v16si)__C);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_shldv_epi32(__A, __B, __C),
- (__v16si)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_shldv_epi32(__A, __B, __C),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B,
- (__v32hi)__C);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_selectw_512(__U,
- (__v32hi)_mm512_shldv_epi16(__A, __B, __C),
- (__v32hi)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_selectw_512(__U,
- (__v32hi)_mm512_shldv_epi16(__A, __B, __C),
- (__v32hi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B,
- (__v8di)__C);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_selectq_512(__U,
- (__v8di)_mm512_shrdv_epi64(__A, __B, __C),
- (__v8di)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_selectq_512(__U,
- (__v8di)_mm512_shrdv_epi64(__A, __B, __C),
- (__v8di)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B,
- (__v16si)__C);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C)
-{
- return (__m512i) __builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_shrdv_epi32(__A, __B, __C),
- (__v16si)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C)
-{
- return (__m512i) __builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_shrdv_epi32(__A, __B, __C),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B,
- (__v32hi)__C);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_selectw_512(__U,
- (__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
- (__v32hi)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C)
-{
- return (__m512i)__builtin_ia32_selectw_512(__U,
- (__v32hi)_mm512_shrdv_epi16(__A, __B, __C),
- (__v32hi)_mm512_setzero_si512());
-}
-
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
-
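/* A minimal usage sketch, assuming -mavx512vbmi2; helper names are
 * illustrative. VBMI2 provides per-lane funnel shifts (shldi/shrdi and the
 * variable-count shldv/shrdv) plus byte/word compress and expand. */
#include <immintrin.h>

__m512i funnel_hi64(__m512i hi, __m512i lo) {
  /* Per 64-bit lane: shift the concatenated 128-bit value hi:lo left by 13
   * and keep the upper 64 bits (the count must be a compile-time constant). */
  return _mm512_shldi_epi64(hi, lo, 13);
}

__m512i pack_selected_bytes(__mmask64 keep, __m512i data) {
  /* Bytes whose mask bit is set are packed toward byte 0; the rest become 0. */
  return _mm512_maskz_compress_epi8(keep, data);
}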
+++ /dev/null
-/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __VBMIINTRIN_H
-#define __VBMIINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), __min_vector_width__(512)))
-
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B)
-{
- return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I,
- (__v64qi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512(__U,
- (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
- (__v64qi)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512(__U,
- (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
- (__v64qi)__I);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512(__U,
- (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_permutexvar_epi8(__A, __B),
- (__v64qi)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
- __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_permutexvar_epi8(__A, __B),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_vpmultishiftqb512((__v64qi)__X, (__v64qi) __Y);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X,
- __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
- (__v64qi)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y)
-{
- return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M,
- (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y),
- (__v64qi)_mm512_setzero_si512());
-}
-
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
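/* A minimal usage sketch, assuming -mavx512vbmi; the helper name is
 * illustrative. _mm512_permutexvar_epi8 is a full lane-crossing byte shuffle:
 * every destination byte can come from any of the 64 source bytes, so one
 * instruction can reverse a whole register. */
#include <immintrin.h>

__m512i reverse_64_bytes(__m512i v) {
  /* _mm512_set_epi8 lists byte 63 first and byte 0 last, so this index
   * vector maps destination byte i to source byte 63 - i. */
  const __m512i rev = _mm512_set_epi8(
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
      32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
      48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63);
  return _mm512_permutexvar_epi8(rev, v);
}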
+++ /dev/null
-/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __VBMIVLINTRIN_H
-#define __VBMIVLINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(256)))
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A,
- (__v16qi)__I,
- (__v16qi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I,
- __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128(__U,
- (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
- (__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U,
- __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128(__U,
- (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
- (__v16qi)__I);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I,
- __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128(__U,
- (__v16qi)_mm_permutex2var_epi8(__A, __I, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B)
-{
- return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I,
- (__v32qi)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I,
- __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256(__U,
- (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
- (__v32qi)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U,
- __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256(__U,
- (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
- (__v32qi)__I);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I,
- __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256(__U,
- (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_permutexvar_epi8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
- __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_permutexvar_epi8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
- __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_permutexvar_epi8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
- __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_permutexvar_epi8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_vpmultishiftqb128((__v16qi)__X, (__v16qi)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X,
- __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
- (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_multishift_epi64_epi8(__X, __Y),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_vpmultishiftqb256((__v32qi)__X, (__v32qi)__Y);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X,
- __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
- (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y),
- (__v32qi)_mm256_setzero_si256());
-}
-
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
+++ /dev/null
-/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLBF16INTRIN_H
-#define __AVX512VLBF16INTRIN_H
-
-#if (__clang_major__ <= 15)
-typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
-#endif
-
-#define __DEFAULT_FN_ATTRS128 \
- __attribute__((__always_inline__, __nodebug__, \
- __target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 \
- __attribute__((__always_inline__, __nodebug__, \
- __target__("avx512vl, avx512bf16"), __min_vector_width__(256)))
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 128-bit vector of [4 x float].
-/// \param __B
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
-/// conversion of __B, and higher 64 bits come from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
- return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
- (__v4sf) __B);
-}
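/* A minimal usage sketch, assuming -mavx512bf16 -mavx512vl; the helper name
 * is illustrative. As described above, the two-source conversion places
 * bf16(__B) in the low 64 bits of the result and bf16(__A) in the high
 * 64 bits. */
#include <immintrin.h>

__m128bh pack_eight_floats_to_bf16(__m128 lo4, __m128 hi4) {
  /* Result lanes 0..3 hold bf16(lo4), lanes 4..7 hold bf16(hi4). */
  return _mm_cvtne2ps_pbh(hi4, lo4);
}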
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 128-bit vector of [4 x float].
-/// \param __B
-/// A 128-bit vector of [4 x float].
-/// \param __W
-/// A 128-bit vector of [8 x bfloat].
-/// \param __U
-/// An 8-bit mask value specifying what is chosen for each element.
-/// A 1 means conversion of __A or __B. A 0 means element from __W.
-/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
-/// conversion of __B, and higher 64 bits come from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
- (__v8hi)__W);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 128-bit vector of [4 x float].
-/// \param __B
-/// A 128-bit vector of [4 x float].
-/// \param __U
-/// An 8-bit mask value specifying what is chosen for each element.
-/// A 1 means conversion of __A or __B. A 0 means element is zero.
-/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
-/// conversion of __B, and higher 64 bits come from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 256-bit vector of [8 x float].
-/// \param __B
-/// A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
-/// conversion of __B, and higher 128 bits come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
-_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
- return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
- (__v8sf) __B);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 256-bit vector of [8 x float].
-/// \param __B
-/// A 256-bit vector of [8 x float].
-/// \param __W
-/// A 256-bit vector of [16 x bfloat].
-/// \param __U
-/// A 16-bit mask value specifying what is chosen for each element.
-/// A 1 means conversion of __A or __B. A 0 means element from __W.
-/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
-/// conversion of __B, and higher 128 bits come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
- return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
- (__v16hi)__W);
-}
-
-/// Convert Two Packed Single Data to One Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 256-bit vector of [8 x float].
-/// \param __B
-/// A 256-bit vector of [8 x float].
-/// \param __U
-/// A 16-bit mask value specifying what is chosen for each element.
-/// A 1 means conversion of __A or __B. A 0 means element is zero.
-/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
-/// conversion of __B, and higher 128 bits come from conversion of __A.
-static __inline__ __m256bh __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
- return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
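/* Editorial usage sketch (not part of the deleted header): how the
 * VCVTNE2PS2BF16 wrappers above are typically used to pack two float vectors
 * into one bf16 vector.  Assumes a compiler and CPU with AVX512VL and
 * AVX512BF16 enabled; the helper name pack_ps_to_bf16 is hypothetical. */
#include <immintrin.h>

static __m256bh pack_ps_to_bf16(__m256 hi, __m256 lo, __mmask16 keep)
{
    /* Lower 8 bf16 lanes come from 'lo', upper 8 from 'hi'; lanes whose
     * mask bit is 0 are zeroed by the maskz variant. */
    return _mm256_maskz_cvtne2ps_pbh(keep, hi, lo);
}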
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
-/// conversion of __A, and higher 64 bits are 0.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_cvtneps_pbh(__m128 __A) {
- return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
- (__v8hi)_mm_undefined_si128(),
- (__mmask8)-1);
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 128-bit vector of [4 x float].
-/// \param __W
-/// A 128-bit vector of [8 x bfloat].
-/// \param __U
-/// A 4-bit mask value specifying what is chosen for each element.
-/// A 1 means conversion of __A. A 0 means element from __W.
-/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
-/// conversion of __A, and higher 64 bits are 0.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
- return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
- (__v8hi)__W,
- (__mmask8)__U);
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 128-bit vector of [4 x float].
-/// \param __U
-/// A 4-bit mask value specifying what is chosen for each element.
-/// A 1 means conversion of __A. A 0 means element is zero.
-/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
-/// conversion of __A, and higher 64 bits are 0.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
- return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
- (__v8hi)_mm_setzero_si128(),
- (__mmask8)__U);
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 256-bit vector of [8 x float].
-/// \returns A 128-bit vector of [8 x bfloat] resulting from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS256
-_mm256_cvtneps_pbh(__m256 __A) {
- return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
- (__v8hi)_mm_undefined_si128(),
- (__mmask8)-1);
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 256-bit vector of [8 x float].
-/// \param __W
-/// A 128-bit vector of [8 x bfloat].
-/// \param __U
-/// An 8-bit mask value specifying what is chosen for each element.
-/// A 1 means conversion of __A. A 0 means element from __W.
-/// \returns A 128-bit vector of [8 x bfloat] resulting from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
- return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
- (__v8hi)__W,
- (__mmask8)__U);
-}
-
-/// Convert Packed Single Data to Packed BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-/// A 256-bit vector of [8 x float].
-/// \param __U
-/// An 8-bit mask value specifying what is chosen for each element.
-/// A 1 means conversion of __A. A 0 means element is zero.
-/// \returns A 128-bit vector of [8 x bfloat] resulting from conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
- return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
- (__v8hi)_mm_setzero_si128(),
- (__mmask8)__U);
-}
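/* Editorial usage sketch (not part of the deleted header): narrowing a vector
 * of 8 floats to 8 bf16 values with VCVTNEPS2BF16, merging into an existing
 * destination under a mask.  Assumes AVX512VL+AVX512BF16; the helper name
 * narrow_ps_to_bf16 is hypothetical. */
#include <immintrin.h>

static __m128bh narrow_ps_to_bf16(__m128bh dst, __mmask8 keep, __m256 src)
{
    /* Lanes with a set mask bit receive the converted value; the rest keep
     * the corresponding element of 'dst'. */
    return _mm256_mask_cvtneps_pbh(dst, keep, src);
}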
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-/// A 128-bit vector of [8 x bfloat].
-/// \param __B
-/// A 128-bit vector of [8 x bfloat].
-/// \param __D
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the dot product of
-/// __A and __B accumulated with __D.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
- return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
- (__v4si)__A,
- (__v4si)__B);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-/// A 128-bit vector of [8 x bfloat].
-/// \param __B
-/// A 128-bit vector of [8 x bfloat].
-/// \param __D
-/// A 128-bit vector of [4 x float].
-/// \param __U
-/// An 8-bit mask value specifying what is chosen for each element.
-/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
-/// \returns A 128-bit vector of [4 x float] containing the dot product of
-/// __A and __B accumulated with __D.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
- (__v4sf)__D);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-/// A 128-bit vector of [8 x bfloat].
-/// \param __B
-/// A 128-bit vector of [8 x bfloat].
-/// \param __D
-/// A 128-bit vector of [4 x float].
-/// \param __U
-/// An 8-bit mask value specifying what is chosen for each element.
-/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
-/// \returns A 128-bit vector of [4 x float] containing the dot product of
-/// __A and __B accumulated with __D.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
- (__v4sf)_mm_setzero_si128());
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-/// A 256-bit vector of [16 x bfloat].
-/// \param __B
-/// A 256-bit vector of [16 x bfloat].
-/// \param __D
-/// A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the dot product of
-/// __A and __B accumulated with __D.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
- return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
- (__v8si)__A,
- (__v8si)__B);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-/// A 256-bit vector of [16 x bfloat].
-/// \param __B
-/// A 256-bit vector of [16 x bfloat].
-/// \param __D
-/// A 256-bit vector of [8 x float].
-/// \param __U
-/// An 8-bit mask value specifying what is chosen for each element.
-/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
-/// \returns A 256-bit vector of [8 x float] containing the dot product of
-/// __A and __B accumulated with __D.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
- (__v8sf)__D);
-}
-
-/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
-///
-/// \param __A
-/// A 256-bit vector of [16 x bfloat].
-/// \param __B
-/// A 256-bit vector of [16 x bfloat].
-/// \param __D
-/// A 256-bit vector of [8 x float].
-/// \param __U
-/// An 8-bit mask value specifying what is chosen for each element.
-/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
-/// \returns A 256-bit vector of [8 x float] containing the dot product of
-/// __A and __B accumulated with __D.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
- (__v8sf)_mm256_setzero_si256());
-}
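/* Editorial sketch (not part of the deleted header): using VDPBF16PS to
 * accumulate bf16 pair products into a float accumulator, as in a simple
 * GEMM inner loop.  Assumes AVX512VL+AVX512BF16; the helper name is
 * hypothetical. */
#include <immintrin.h>

static __m256 bf16_fma_step(__m256 acc, __m256bh a, __m256bh b)
{
    /* For each of the 8 float lanes i:
     *   acc[i] += a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1]
     * where the bf16 inputs are widened to float before multiplying. */
    return _mm256_dpbf16_ps(acc, a, b);
}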
-
-/// Convert One Single float Data to One BF16 Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
-///
-/// \param __A
-/// A float value.
-/// \returns A bf16 value whose sign and exponent fields are unchanged and
-/// whose fraction field is truncated to 7 bits.
-static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
- __v4sf __V = {__A, 0, 0, 0};
-#if (__clang_major__ > 15)
- __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
- (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
- return (__bf16)__R[0];
-#else
- __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
- (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
- return __R[0];
-#endif
-}
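/* Editorial sketch (not part of the deleted header): what the bf16 layout
 * looks like in plain C.  A bf16 value is the top 16 bits of an IEEE-754
 * float (1 sign bit, 8 exponent bits, 7 fraction bits).  Note that the
 * VCVTNEPS2BF16 instruction behind _mm_cvtness_sbh rounds to nearest-even,
 * so this truncating helper only illustrates the bit layout; it is not a
 * bit-exact model of the intrinsic. */
#include <stdint.h>
#include <string.h>

static uint16_t bf16_bits_truncate(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits); /* reinterpret the float's encoding */
    return (uint16_t)(bits >> 16);  /* keep sign, exponent, top 7 fraction bits */
}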
-
-/// Convert Packed BF16 Data to Packed float Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __A
-/// A 128-bit vector of [4 x bfloat].
-/// \returns A 128-bit vector of [4 x float] resulting from conversion of __A.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
- return _mm_castsi128_ps(
- (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __A
-/// A 128-bit vector of [8 x bfloat].
-/// \returns A 256-bit vector of [8 x float] resulting from conversion of __A.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
- return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
- (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __U
-/// A 4-bit mask. Elements are zeroed out when the corresponding mask
-/// bit is not set.
-/// \param __A
-/// A 128-bit vector of [4 x bfloat].
-/// \returns A 128-bit vector of [4 x float] resulting from conversion of __A.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
- return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
- (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __U
-/// An 8-bit mask. Elements are zeroed out when the corresponding mask
-/// bit is not set.
-/// \param __A
-/// A 128-bit vector of [8 x bfloat].
-/// \returns A 256-bit vector of [8 x float] come from conversion of __A
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
- return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
- (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data using merging mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __S
-/// A 128-bit vector of [4 x float]. Elements are copied from __S when
-/// the corresponding mask bit is not set.
-/// \param __U
-/// A 4-bit mask. Converted elements are written when the corresponding mask
-/// bit is set; otherwise elements are copied from __S.
-/// \param __A
-/// A 128-bit vector of [4 x bfloat].
-/// \returns A 128-bit vector of [4 x float] resulting from conversion of __A.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
- return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
- (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
- 16));
-}
-
-/// Convert Packed BF16 Data to Packed float Data using merging mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \param __S
-/// A 256-bit vector of [8 x float]. Elements are copied from __S when
-/// the corresponding mask bit is not set.
-/// \param __U
-/// An 8-bit mask. Converted elements are written when the corresponding mask
-/// bit is set; otherwise elements are copied from __S.
-/// \param __A
-/// A 128-bit vector of [8 x bfloat].
-/// \returns A 256-bit vector of [8 x float] resulting from conversion of __A.
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
- return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
- (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
- 16));
-}
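/* Editorial sketch (not part of the deleted header): widening bf16 back to
 * float.  The helpers above implement this without a dedicated instruction:
 * each bf16 value is zero-extended to 32 bits and shifted left by 16, which
 * reconstructs the original float's upper bits with a zero low fraction.
 * The round-trip below assumes AVX512VL+AVX512BF16 and a hypothetical name. */
#include <immintrin.h>

static __m256 bf16_roundtrip(__m256 x)
{
    __m128bh packed = _mm256_cvtneps_pbh(x); /* 8 floats -> 8 bf16 */
    return _mm256_cvtpbh_ps(packed);         /* 8 bf16 -> 8 floats (lossy) */
}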
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
+++ /dev/null
-/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlbitalgintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLBITALGINTRIN_H
-#define __AVX512VLBITALGINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(256)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_popcnt_epi16(__m256i __A)
-{
- return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B)
-{
- return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U,
- (__v16hi) _mm256_popcnt_epi16(__B),
- (__v16hi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B)
-{
- return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(),
- __U,
- __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_popcnt_epi16(__m128i __A)
-{
- return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B)
-{
- return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U,
- (__v8hi) _mm_popcnt_epi16(__B),
- (__v8hi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B)
-{
- return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(),
- __U,
- __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_popcnt_epi8(__m256i __A)
-{
- return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B)
-{
- return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U,
- (__v32qi) _mm256_popcnt_epi8(__B),
- (__v32qi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B)
-{
- return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(),
- __U,
- __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_popcnt_epi8(__m128i __A)
-{
- return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B)
-{
- return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U,
- (__v16qi) _mm_popcnt_epi8(__B),
- (__v16qi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B)
-{
- return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(),
- __U,
- __B);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A,
- (__v32qi) __B,
- __U);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B)
-{
- return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1,
- __A,
- __B);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A,
- (__v16qi) __B,
- __U);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
-{
- return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1,
- __A,
- __B);
-}
-
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
+++ /dev/null
-/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLBWINTRIN_H
-#define __AVX512VLBWINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(256)))
-
-/* Integer compare */
-
-#define _mm_cmp_epi8_mask(a, b, p) \
- ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
- (__v16qi)(__m128i)(b), (int)(p), \
- (__mmask16)-1))
-
-#define _mm_mask_cmp_epi8_mask(m, a, b, p) \
- ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \
- (__v16qi)(__m128i)(b), (int)(p), \
- (__mmask16)(m)))
-
-#define _mm_cmp_epu8_mask(a, b, p) \
- ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
- (__v16qi)(__m128i)(b), (int)(p), \
- (__mmask16)-1))
-
-#define _mm_mask_cmp_epu8_mask(m, a, b, p) \
- ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \
- (__v16qi)(__m128i)(b), (int)(p), \
- (__mmask16)(m)))
-
-#define _mm256_cmp_epi8_mask(a, b, p) \
- ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
- (__v32qi)(__m256i)(b), (int)(p), \
- (__mmask32)-1))
-
-#define _mm256_mask_cmp_epi8_mask(m, a, b, p) \
- ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \
- (__v32qi)(__m256i)(b), (int)(p), \
- (__mmask32)(m)))
-
-#define _mm256_cmp_epu8_mask(a, b, p) \
- ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
- (__v32qi)(__m256i)(b), (int)(p), \
- (__mmask32)-1))
-
-#define _mm256_mask_cmp_epu8_mask(m, a, b, p) \
- ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \
- (__v32qi)(__m256i)(b), (int)(p), \
- (__mmask32)(m)))
-
-#define _mm_cmp_epi16_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
- (__v8hi)(__m128i)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm_mask_cmp_epi16_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \
- (__v8hi)(__m128i)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm_cmp_epu16_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
- (__v8hi)(__m128i)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm_mask_cmp_epu16_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \
- (__v8hi)(__m128i)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm256_cmp_epi16_mask(a, b, p) \
- ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
- (__v16hi)(__m256i)(b), (int)(p), \
- (__mmask16)-1))
-
-#define _mm256_mask_cmp_epi16_mask(m, a, b, p) \
- ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \
- (__v16hi)(__m256i)(b), (int)(p), \
- (__mmask16)(m)))
-
-#define _mm256_cmp_epu16_mask(a, b, p) \
- ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
- (__v16hi)(__m256i)(b), (int)(p), \
- (__mmask16)-1))
-
-#define _mm256_mask_cmp_epu16_mask(m, a, b, p) \
- ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \
- (__v16hi)(__m256i)(b), (int)(p), \
- (__mmask16)(m)))
-
-#define _mm_cmpeq_epi8_mask(A, B) \
- _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epi8_mask(k, A, B) \
- _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epi8_mask(A, B) \
- _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epi8_mask(k, A, B) \
- _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epi8_mask(A, B) \
- _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epi8_mask(k, A, B) \
- _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epi8_mask(A, B) \
- _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epi8_mask(k, A, B) \
- _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epi8_mask(A, B) \
- _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epi8_mask(k, A, B) \
- _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epi8_mask(A, B) \
- _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epi8_mask(k, A, B) \
- _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epi8_mask(A, B) \
- _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epi8_mask(k, A, B) \
- _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epi8_mask(A, B) \
- _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epi8_mask(k, A, B) \
- _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epi8_mask(A, B) \
- _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epi8_mask(k, A, B) \
- _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epi8_mask(A, B) \
- _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epi8_mask(k, A, B) \
- _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epi8_mask(A, B) \
- _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epi8_mask(k, A, B) \
- _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epi8_mask(A, B) \
- _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epi8_mask(k, A, B) \
- _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm_cmpeq_epu8_mask(A, B) \
- _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epu8_mask(k, A, B) \
- _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epu8_mask(A, B) \
- _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epu8_mask(k, A, B) \
- _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epu8_mask(A, B) \
- _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epu8_mask(k, A, B) \
- _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epu8_mask(A, B) \
- _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epu8_mask(k, A, B) \
- _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epu8_mask(A, B) \
- _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epu8_mask(k, A, B) \
- _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epu8_mask(A, B) \
- _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epu8_mask(k, A, B) \
- _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epu8_mask(A, B) \
- _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epu8_mask(k, A, B) \
- _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epu8_mask(A, B) \
- _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epu8_mask(k, A, B) \
- _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epu8_mask(A, B) \
- _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epu8_mask(k, A, B) \
- _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epu8_mask(A, B) \
- _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epu8_mask(k, A, B) \
- _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epu8_mask(A, B) \
- _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epu8_mask(k, A, B) \
- _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epu8_mask(A, B) \
- _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epu8_mask(k, A, B) \
- _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm_cmpeq_epi16_mask(A, B) \
- _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epi16_mask(k, A, B) \
- _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epi16_mask(A, B) \
- _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epi16_mask(k, A, B) \
- _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epi16_mask(A, B) \
- _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epi16_mask(k, A, B) \
- _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epi16_mask(A, B) \
- _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epi16_mask(k, A, B) \
- _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epi16_mask(A, B) \
- _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epi16_mask(k, A, B) \
- _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epi16_mask(A, B) \
- _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epi16_mask(k, A, B) \
- _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epi16_mask(A, B) \
- _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epi16_mask(k, A, B) \
- _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epi16_mask(A, B) \
- _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epi16_mask(k, A, B) \
- _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epi16_mask(A, B) \
- _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epi16_mask(k, A, B) \
- _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epi16_mask(A, B) \
- _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epi16_mask(k, A, B) \
- _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epi16_mask(A, B) \
- _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epi16_mask(k, A, B) \
- _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epi16_mask(A, B) \
- _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epi16_mask(k, A, B) \
- _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm_cmpeq_epu16_mask(A, B) \
- _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epu16_mask(k, A, B) \
- _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epu16_mask(A, B) \
- _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epu16_mask(k, A, B) \
- _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epu16_mask(A, B) \
- _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epu16_mask(k, A, B) \
- _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epu16_mask(A, B) \
- _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epu16_mask(k, A, B) \
- _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epu16_mask(A, B) \
- _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epu16_mask(k, A, B) \
- _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epu16_mask(A, B) \
- _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epu16_mask(k, A, B) \
- _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epu16_mask(A, B) \
- _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epu16_mask(k, A, B) \
- _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epu16_mask(A, B) \
- _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epu16_mask(k, A, B) \
- _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epu16_mask(A, B) \
- _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epu16_mask(k, A, B) \
- _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epu16_mask(A, B) \
- _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epu16_mask(k, A, B) \
- _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epu16_mask(A, B) \
- _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epu16_mask(k, A, B) \
- _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epu16_mask(A, B) \
- _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epu16_mask(k, A, B) \
- _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE)
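/* Editorial usage sketch (not part of the deleted header): the compare macros
 * above produce a k-mask rather than a vector, and that mask can then
 * predicate other AVX-512VL operations.  Assumes AVX512VL+AVX512BW;
 * saturate_gt is a hypothetical helper. */
#include <immintrin.h>

static __m256i saturate_gt(__m256i a, __m256i b)
{
    /* Set each 16-bit lane of the result to the saturating sum a+b where
     * a > b, and to zero elsewhere. */
    __mmask16 gt = _mm256_cmpgt_epi16_mask(a, b);
    return _mm256_maskz_adds_epi16(gt, a, b);
}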
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_add_epi8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_add_epi8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_add_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_add_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_sub_epi8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_sub_epi8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_sub_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_sub_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_add_epi8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_add_epi8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_add_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_add_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_sub_epi8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_sub_epi8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_sub_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_sub_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_mullo_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_mullo_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_mullo_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_mullo_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
-{
- return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
- (__v16qi) __W,
- (__v16qi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
-{
- return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
- (__v32qi) __W,
- (__v32qi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
-{
- return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
- (__v8hi) __W,
- (__v8hi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
-{
- return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
- (__v16hi) __W,
- (__v16hi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_abs_epi8(__A),
- (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_abs_epi8(__A),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_abs_epi8(__A),
- (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_abs_epi8(__A),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_abs_epi16(__A),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_abs_epi16(__A),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_abs_epi16(__A),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_abs_epi16(__A),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_packs_epi32(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_packs_epi32(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_packs_epi32(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_packs_epi32(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_packs_epi16(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_packs_epi16(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_packs_epi16(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_packs_epi16(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_packus_epi32(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_packus_epi32(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_packus_epi32(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_packus_epi32(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_packus_epi16(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_packus_epi16(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_packus_epi16(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_packus_epi16(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_adds_epi8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_adds_epi8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_adds_epi8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_adds_epi8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_adds_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_adds_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_adds_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_adds_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_adds_epu8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_adds_epu8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_adds_epu8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_adds_epu8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_adds_epu16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_adds_epu16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_adds_epu16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_adds_epu16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_avg_epu8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_avg_epu8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_avg_epu8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_avg_epu8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_avg_epu16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_avg_epu16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_avg_epu16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_avg_epu16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_max_epi8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_max_epi8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_max_epi8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_max_epi8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_max_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_max_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_max_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_max_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_max_epu8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_max_epu8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_max_epu8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_max_epu8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_max_epu16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_max_epu16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_max_epu16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_max_epu16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_min_epi8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_min_epi8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_min_epi8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_min_epi8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_min_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_min_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_min_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_min_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_min_epu8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm_min_epu8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_min_epu8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M,
- (__v32qi)_mm256_min_epu8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_min_epu16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_min_epu16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_min_epu16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_min_epu16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_shuffle_epi8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_shuffle_epi8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_shuffle_epi8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_shuffle_epi8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_subs_epi8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_subs_epi8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_subs_epi8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_subs_epi8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_subs_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_subs_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_subs_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_subs_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_subs_epu8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_subs_epu8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_subs_epu8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_subs_epu8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_subs_epu16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_subs_epu16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A,
- __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_subs_epu16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_subs_epu16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
- (__v8hi) __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I,
- __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128(__U,
- (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
- (__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U,
- __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128(__U,
- (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
- (__v8hi)__I);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
- __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128(__U,
- (__v8hi)_mm_permutex2var_epi16(__A, __I, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B)
-{
- return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
- (__v16hi)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I,
- __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256(__U,
- (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
- (__v16hi)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U,
- __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256(__U,
- (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
- (__v16hi)__I);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I,
- __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256(__U,
- (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_maddubs_epi16(__X, __Y),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_maddubs_epi16(__X, __Y),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X,
- __m256i __Y) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_maddubs_epi16(__X, __Y),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_maddubs_epi16(__X, __Y),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_madd_epi16(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_madd_epi16(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_madd_epi16(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_madd_epi16(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtsepi16_epi8 (__m128i __A) {
- return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
- (__v16qi) _mm_setzero_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
- return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
- (__v16qi) __O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) {
- return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
- (__v16qi) _mm_setzero_si128(),
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtsepi16_epi8 (__m256i __A) {
- return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
- (__v16qi) _mm_setzero_si128(),
- (__mmask16) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
- return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
- (__v16qi) __O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) {
- return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
- (__v16qi) _mm_setzero_si128(),
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtusepi16_epi8 (__m128i __A) {
- return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
- (__v16qi) _mm_setzero_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
- return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
- (__v16qi) __O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) {
- return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
- (__v16qi) _mm_setzero_si128(),
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtusepi16_epi8 (__m256i __A) {
- return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
- (__v16qi) _mm_setzero_si128(),
- (__mmask16) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
- return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
- (__v16qi) __O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) {
- return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
- (__v16qi) _mm_setzero_si128(),
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi16_epi8 (__m128i __A) {
- return (__m128i)__builtin_shufflevector(
- __builtin_convertvector((__v8hi)__A, __v8qi),
- (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
- 12, 13, 14, 15);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) {
- return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
- (__v16qi) __O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) {
- return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
- (__v16qi) _mm_setzero_si128(),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovwb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
-}
-
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovuswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi16_epi8 (__m256i __A) {
- return (__m128i)__builtin_convertvector((__v16hi) __A, __v16qi);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) {
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm256_cvtepi16_epi8(__A),
- (__v16qi)__O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) {
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M,
- (__v16qi)_mm256_cvtepi16_epi8(__A),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
-{
- __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
-{
- __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
-{
- __builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_mulhrs_epi16(__X, __Y),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_mulhrs_epi16(__X, __Y),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_mulhrs_epi16(__X, __Y),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_mulhi_epu16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_mulhi_epu16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_mulhi_epu16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_mulhi_epu16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_mulhi_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_mulhi_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_mulhi_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_mulhi_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_unpackhi_epi8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_unpackhi_epi8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_unpackhi_epi8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_unpackhi_epi8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_unpackhi_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_unpackhi_epi16(__A, __B),
- (__v8hi) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_unpackhi_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_unpackhi_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_unpacklo_epi8(__A, __B),
- (__v16qi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
- (__v16qi)_mm_unpacklo_epi8(__A, __B),
- (__v16qi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_unpacklo_epi8(__A, __B),
- (__v32qi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
- (__v32qi)_mm256_unpacklo_epi8(__A, __B),
- (__v32qi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_unpacklo_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_unpacklo_epi16(__A, __B),
- (__v8hi) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_unpacklo_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_unpacklo_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_cvtepi8_epi16(__A),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_cvtepi8_epi16(__A),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_cvtepi8_epi16(__A),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_cvtepi8_epi16(__A),
- (__v16hi)_mm256_setzero_si256());
-}
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_cvtepu8_epi16(__A),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_cvtepu8_epi16(__A),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_cvtepu8_epi16(__A),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_cvtepu8_epi16(__A),
- (__v16hi)_mm256_setzero_si256());
-}
-
-
-#define _mm_mask_shufflehi_epi16(W, U, A, imm) \
- ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
- (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
- (__v8hi)(__m128i)(W)))
-
-#define _mm_maskz_shufflehi_epi16(U, A, imm) \
- ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
- (__v8hi)_mm_shufflehi_epi16((A), (imm)), \
- (__v8hi)_mm_setzero_si128()))
-
-#define _mm256_mask_shufflehi_epi16(W, U, A, imm) \
- ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
- (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
- (__v16hi)(__m256i)(W)))
-
-#define _mm256_maskz_shufflehi_epi16(U, A, imm) \
- ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
- (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \
- (__v16hi)_mm256_setzero_si256()))
-
-#define _mm_mask_shufflelo_epi16(W, U, A, imm) \
- ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
- (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
- (__v8hi)(__m128i)(W)))
-
-#define _mm_maskz_shufflelo_epi16(U, A, imm) \
- ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
- (__v8hi)_mm_shufflelo_epi16((A), (imm)), \
- (__v8hi)_mm_setzero_si128()))
-
-#define _mm256_mask_shufflelo_epi16(W, U, A, imm) \
- ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
- (__v16hi)_mm256_shufflelo_epi16((A), \
- (imm)), \
- (__v16hi)(__m256i)(W)))
-
-#define _mm256_maskz_shufflelo_epi16(U, A, imm) \
- ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
- (__v16hi)_mm256_shufflelo_epi16((A), \
- (imm)), \
- (__v16hi)_mm256_setzero_si256()))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sllv_epi16(__m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_psllv16hi((__v16hi)__A, (__v16hi)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_sllv_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_sllv_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_sllv_epi16(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_psllv8hi((__v8hi)__A, (__v8hi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_sllv_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_sllv_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_sll_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_sll_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_sll_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_sll_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_slli_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_slli_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A,
- unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_slli_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_slli_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srlv_epi16(__m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_psrlv16hi((__v16hi)__A, (__v16hi)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_srlv_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_srlv_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srlv_epi16(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_psrlv8hi((__v8hi)__A, (__v8hi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_srlv_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_srlv_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srav_epi16(__m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_psrav16hi((__v16hi)__A, (__v16hi)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_srav_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_srav_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srav_epi16(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_psrav8hi((__v8hi)__A, (__v8hi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_srav_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_srav_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_sra_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_sra_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_sra_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_sra_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_srai_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_srai_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A,
- unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_srai_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_srai_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_srl_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_srl_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_srl_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_srl_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_srli_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_srli_epi16(__A, __B),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_srli_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_srli_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
-{
- return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
- (__v8hi) __A,
- (__v8hi) __W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A)
-{
- return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U,
- (__v8hi) __A,
- (__v8hi) _mm_setzero_si128 ());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
-{
- return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
- (__v16hi) __A,
- (__v16hi) __W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A)
-{
- return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U,
- (__v16hi) __A,
- (__v16hi) _mm256_setzero_si256 ());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
-{
- return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
- (__v16qi) __A,
- (__v16qi) __W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A)
-{
- return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U,
- (__v16qi) __A,
- (__v16qi) _mm_setzero_si128 ());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
-{
- return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
- (__v32qi) __A,
- (__v32qi) __W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
-{
- return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U,
- (__v32qi) __A,
- (__v32qi) _mm256_setzero_si256 ());
-}
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
-{
- return (__m128i) __builtin_ia32_selectb_128(__M,
- (__v16qi) _mm_set1_epi8(__A),
- (__v16qi) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_set1_epi8 (__mmask16 __M, char __A)
-{
- return (__m128i) __builtin_ia32_selectb_128(__M,
- (__v16qi) _mm_set1_epi8(__A),
- (__v16qi) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
-{
- return (__m256i) __builtin_ia32_selectb_256(__M,
- (__v32qi) _mm256_set1_epi8(__A),
- (__v32qi) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
-{
- return (__m256i) __builtin_ia32_selectb_256(__M,
- (__v32qi) _mm256_set1_epi8(__A),
- (__v32qi) _mm256_setzero_si256());
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS128
-_mm_loadu_epi16 (void const *__P)
-{
- struct __loadu_epi16 {
- __m128i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_epi16*)__P)->__v;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_loaddquhi128_mask ((const __v8hi *) __P,
- (__v8hi) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_loaddquhi128_mask ((const __v8hi *) __P,
- (__v8hi)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS256
-_mm256_loadu_epi16 (void const *__P)
-{
- struct __loadu_epi16 {
- __m256i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_epi16*)__P)->__v;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_loaddquhi256_mask ((const __v16hi *) __P,
- (__v16hi) __W,
- (__mmask16) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_loaddquhi256_mask ((const __v16hi *) __P,
- (__v16hi)
- _mm256_setzero_si256 (),
- (__mmask16) __U);
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS128
-_mm_loadu_epi8 (void const *__P)
-{
- struct __loadu_epi8 {
- __m128i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_epi8*)__P)->__v;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_loaddquqi128_mask ((const __v16qi *) __P,
- (__v16qi) __W,
- (__mmask16) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_loaddquqi128_mask ((const __v16qi *) __P,
- (__v16qi)
- _mm_setzero_si128 (),
- (__mmask16) __U);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS256
-_mm256_loadu_epi8 (void const *__P)
-{
- struct __loadu_epi8 {
- __m256i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_epi8*)__P)->__v;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_loaddquqi256_mask ((const __v32qi *) __P,
- (__v32qi) __W,
- (__mmask32) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_loaddquqi256_mask ((const __v32qi *) __P,
- (__v32qi)
- _mm256_setzero_si256 (),
- (__mmask32) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_storeu_epi16 (void *__P, __m128i __A)
-{
- struct __storeu_epi16 {
- __m128i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_epi16*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A)
-{
- __builtin_ia32_storedquhi128_mask ((__v8hi *) __P,
- (__v8hi) __A,
- (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS256
-_mm256_storeu_epi16 (void *__P, __m256i __A)
-{
- struct __storeu_epi16 {
- __m256i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_epi16*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A)
-{
- __builtin_ia32_storedquhi256_mask ((__v16hi *) __P,
- (__v16hi) __A,
- (__mmask16) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_storeu_epi8 (void *__P, __m128i __A)
-{
- struct __storeu_epi8 {
- __m128i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_epi8*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A)
-{
- __builtin_ia32_storedquqi128_mask ((__v16qi *) __P,
- (__v16qi) __A,
- (__mmask16) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS256
-_mm256_storeu_epi8 (void *__P, __m256i __A)
-{
- struct __storeu_epi8 {
- __m256i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_epi8*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
-{
- __builtin_ia32_storedquqi256_mask ((__v32qi *) __P,
- (__v32qi) __A,
- (__mmask32) __U);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_test_epi8_mask (__m128i __A, __m128i __B)
-{
- return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
-{
- return _mm_mask_cmpneq_epi8_mask (__U, _mm_and_si128 (__A, __B),
- _mm_setzero_si128());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_test_epi8_mask (__m256i __A, __m256i __B)
-{
- return _mm256_cmpneq_epi8_mask (_mm256_and_si256(__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
-{
- return _mm256_mask_cmpneq_epi8_mask (__U, _mm256_and_si256(__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_test_epi16_mask (__m128i __A, __m128i __B)
-{
- return _mm_cmpneq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
-{
- return _mm_mask_cmpneq_epi16_mask (__U, _mm_and_si128 (__A, __B),
- _mm_setzero_si128());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
-_mm256_test_epi16_mask (__m256i __A, __m256i __B)
-{
- return _mm256_cmpneq_epi16_mask (_mm256_and_si256 (__A, __B),
- _mm256_setzero_si256 ());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
-_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
-{
- return _mm256_mask_cmpneq_epi16_mask (__U, _mm256_and_si256(__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_testn_epi8_mask (__m128i __A, __m128i __B)
-{
- return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
-{
- return _mm_mask_cmpeq_epi8_mask (__U, _mm_and_si128 (__A, __B),
- _mm_setzero_si128());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_testn_epi8_mask (__m256i __A, __m256i __B)
-{
- return _mm256_cmpeq_epi8_mask (_mm256_and_si256 (__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
-{
- return _mm256_mask_cmpeq_epi8_mask (__U, _mm256_and_si256 (__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_testn_epi16_mask (__m128i __A, __m128i __B)
-{
- return _mm_cmpeq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
-{
- return _mm_mask_cmpeq_epi16_mask (__U, _mm_and_si128(__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
-_mm256_testn_epi16_mask (__m256i __A, __m256i __B)
-{
- return _mm256_cmpeq_epi16_mask (_mm256_and_si256(__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
-_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
-{
- return _mm256_mask_cmpeq_epi16_mask (__U, _mm256_and_si256 (__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS128
-_mm_movepi8_mask (__m128i __A)
-{
- return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A);
-}
-
-static __inline__ __mmask32 __DEFAULT_FN_ATTRS256
-_mm256_movepi8_mask (__m256i __A)
-{
- return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_movepi16_mask (__m128i __A)
-{
- return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A);
-}
-
-static __inline__ __mmask16 __DEFAULT_FN_ATTRS256
-_mm256_movepi16_mask (__m256i __A)
-{
- return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_movm_epi8 (__mmask16 __A)
-{
- return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_movm_epi8 (__mmask32 __A)
-{
- return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_movm_epi16 (__mmask8 __A)
-{
- return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_movm_epi16 (__mmask16 __A)
-{
- return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectb_128(__M,
- (__v16qi) _mm_broadcastb_epi8(__A),
- (__v16qi) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectb_128(__M,
- (__v16qi) _mm_broadcastb_epi8(__A),
- (__v16qi) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectb_256(__M,
- (__v32qi) _mm256_broadcastb_epi8(__A),
- (__v32qi) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectb_256(__M,
- (__v32qi) _mm256_broadcastb_epi8(__A),
- (__v32qi) _mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectw_128(__M,
- (__v8hi) _mm_broadcastw_epi16(__A),
- (__v8hi) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectw_128(__M,
- (__v8hi) _mm_broadcastw_epi16(__A),
- (__v8hi) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectw_256(__M,
- (__v16hi) _mm256_broadcastw_epi16(__A),
- (__v16hi) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectw_256(__M,
- (__v16hi) _mm256_broadcastw_epi16(__A),
- (__v16hi) _mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
-{
- return (__m256i) __builtin_ia32_selectw_256 (__M,
- (__v16hi) _mm256_set1_epi16(__A),
- (__v16hi) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
-{
- return (__m256i) __builtin_ia32_selectw_256(__M,
- (__v16hi)_mm256_set1_epi16(__A),
- (__v16hi) _mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
-{
- return (__m128i) __builtin_ia32_selectw_128(__M,
- (__v8hi) _mm_set1_epi16(__A),
- (__v8hi) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_set1_epi16 (__mmask8 __M, short __A)
-{
- return (__m128i) __builtin_ia32_selectw_128(__M,
- (__v8hi) _mm_set1_epi16(__A),
- (__v8hi) _mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_permutexvar_epi16 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_permvarhi128((__v8hi) __B, (__v8hi) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_permutexvar_epi16(__A, __B),
- (__v8hi) _mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
- __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M,
- (__v8hi)_mm_permutexvar_epi16(__A, __B),
- (__v8hi)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutexvar_epi16 (__m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_permvarhi256((__v16hi) __B, (__v16hi) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A,
- __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_permutexvar_epi16(__A, __B),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
- __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M,
- (__v16hi)_mm256_permutexvar_epi16(__A, __B),
- (__v16hi)__W);
-}
-
-#define _mm_mask_alignr_epi8(W, U, A, B, N) \
- ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
- (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
- (__v16qi)(__m128i)(W)))
-
-#define _mm_maskz_alignr_epi8(U, A, B, N) \
- ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
- (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \
- (__v16qi)_mm_setzero_si128()))
-
-#define _mm256_mask_alignr_epi8(W, U, A, B, N) \
- ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
- (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
- (__v32qi)(__m256i)(W)))
-
-#define _mm256_maskz_alignr_epi8(U, A, B, N) \
- ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
- (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \
- (__v32qi)_mm256_setzero_si256()))
-
-#define _mm_dbsad_epu8(A, B, imm) \
- ((__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (int)(imm)))
-
-#define _mm_mask_dbsad_epu8(W, U, A, B, imm) \
- ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
- (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
- (__v8hi)(__m128i)(W)))
-
-#define _mm_maskz_dbsad_epu8(U, A, B, imm) \
- ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
- (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \
- (__v8hi)_mm_setzero_si128()))
-
-#define _mm256_dbsad_epu8(A, B, imm) \
- ((__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \
- (__v32qi)(__m256i)(B), (int)(imm)))
-
-#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
- (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
- (__v16hi)(__m256i)(W)))
-
-#define _mm256_maskz_dbsad_epu8(U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
- (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
- (__v16hi)_mm256_setzero_si256()))
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __AVX512VLBWINTRIN_H */
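/*
 * Minimal usage sketch for the merge/zero masking pattern used throughout
 * this header: every _mm*_mask_* form computes the plain operation and then
 * keeps, per lane, either that result (mask bit set) or the pass-through
 * operand __W, while the _mm*_maskz_* form substitutes zero for __W.  The
 * sketch assumes a compiler targeting avx512bw+avx512vl, the <immintrin.h>
 * entry point, and the 128-bit _mm_mask_max_epi16 / _mm_maskz_max_epi16
 * variants defined earlier in this header; demo_masked_max() itself is a
 * hypothetical helper, not part of the header being patched.
 */
#include <immintrin.h>

static __inline__ void demo_masked_max(void)
{
  __m128i a = _mm_set1_epi16(3);
  __m128i b = _mm_set1_epi16(7);
  __m128i w = _mm_set1_epi16(-1);   /* pass-through lanes for merge masking */
  __mmask8 m = 0x0F;                /* only the low four 16-bit lanes active */

  /* Lanes 0-3 become max(3,7) == 7; lanes 4-7 keep -1 from w. */
  __m128i merged = _mm_mask_max_epi16(w, m, a, b);
  /* Lanes 0-3 become 7; lanes 4-7 are zeroed. */
  __m128i zeroed = _mm_maskz_max_epi16(m, a, b);
  (void)merged;
  (void)zeroed;
}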
+++ /dev/null
-/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLCDINTRIN_H
-#define __AVX512VLCDINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(256)))
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastmb_epi64 (__mmask8 __A)
-{
- return (__m128i) _mm_set1_epi64x((long long) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastmb_epi64 (__mmask8 __A)
-{
- return (__m256i) _mm256_set1_epi64x((long long)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcastmw_epi32 (__mmask16 __A)
-{
- return (__m128i) _mm_set1_epi32((int)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcastmw_epi32 (__mmask16 __A)
-{
- return (__m256i) _mm256_set1_epi32((int)__A);
-}
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_conflict_epi64 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_conflict_epi64(__A),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_conflict_epi64(__A),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_conflict_epi64 (__m256i __A)
-{
- return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_conflict_epi64(__A),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_conflict_epi64(__A),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_conflict_epi32 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_conflict_epi32(__A),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_conflict_epi32(__A),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_conflict_epi32 (__m256i __A)
-{
- return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_conflict_epi32(__A),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_conflict_epi32(__A),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_lzcnt_epi32 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_lzcnt_epi32(__A),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_lzcnt_epi32(__A),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_lzcnt_epi32 (__m256i __A)
-{
- return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_lzcnt_epi32(__A),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_lzcnt_epi32(__A),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_lzcnt_epi64 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_lzcnt_epi64(__A),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_lzcnt_epi64(__A),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_lzcnt_epi64 (__m256i __A)
-{
- return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_lzcnt_epi64(__A),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_lzcnt_epi64(__A),
- (__v4di)_mm256_setzero_si256());
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __AVX512VLCDINTRIN_H */
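As a quick, hedged illustration of the AVX512CD+VL helpers dropped above (not part of the patch; assumes a host compiler run with -mavx512vl -mavx512cd, arbitrary sample values, and a hypothetical standalone program):

#include <immintrin.h>
#include <stdio.h>

/* Hypothetical sample: per-lane leading-zero counts of eight 32-bit lanes. */
int main(void)
{
    __m256i v  = _mm256_set_epi32(0, 1, 2, 4, 8, 256, 65536, -1);
    __m256i lz = _mm256_lzcnt_epi32(v);      /* an all-zero lane yields 32 */
    int out[8];
    _mm256_storeu_si256((__m256i *)out, lz);
    for (int i = 0; i < 8; ++i)
        printf("%d ", out[i]);               /* prints: 0 15 23 28 29 30 31 32 */
    printf("\n");
    return 0;
}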
+++ /dev/null
-/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLDQINTRIN_H
-#define __AVX512VLDQINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(256)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mullo_epi64 (__m256i __A, __m256i __B) {
- return (__m256i) ((__v4du) __A * (__v4du) __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_mullo_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_mullo_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mullo_epi64 (__m128i __A, __m128i __B) {
- return (__m128i) ((__v2du) __A * (__v2du) __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_mullo_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_mullo_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_andnot_pd(__A, __B),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_andnot_pd(__A, __B),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_andnot_pd(__A, __B),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_andnot_pd(__A, __B),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_andnot_ps(__A, __B),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_andnot_ps(__A, __B),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_andnot_ps(__A, __B),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_andnot_ps(__A, __B),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_and_pd(__A, __B),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_and_pd(__A, __B),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_and_pd(__A, __B),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_and_pd(__A, __B),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_and_ps(__A, __B),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_and_ps(__A, __B),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_and_ps(__A, __B),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_and_ps(__A, __B),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_xor_pd(__A, __B),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_xor_pd(__A, __B),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_xor_pd(__A, __B),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_xor_pd(__A, __B),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_xor_ps(__A, __B),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_xor_ps(__A, __B),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_xor_ps(__A, __B),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_xor_ps(__A, __B),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_or_pd(__A, __B),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_or_pd(__A, __B),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_or_pd(__A, __B),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_or_pd(__A, __B),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_or_ps(__A, __B),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_or_ps(__A, __B),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_or_ps(__A, __B),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_or_ps(__A, __B),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtpd_epi64 (__m128d __A) {
- return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
- (__v2di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtpd_epi64 (__m256d __A) {
- return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
- return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
- (__v4di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) {
- return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtpd_epu64 (__m128d __A) {
- return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
- (__v2di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtpd_epu64 (__m256d __A) {
- return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
- return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
- (__v4di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) {
- return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtps_epi64 (__m128 __A) {
- return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
- return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
- (__v2di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
- return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtps_epi64 (__m128 __A) {
- return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
- return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
- (__v4di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) {
- return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtps_epu64 (__m128 __A) {
- return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
- return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
- (__v2di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
- return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtps_epu64 (__m128 __A) {
- return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
- return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
- (__v4di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) {
- return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_pd (__m128i __A) {
- return (__m128d)__builtin_convertvector((__v2di)__A, __v2df);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_cvtepi64_pd(__A),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_cvtepi64_pd(__A),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_pd (__m256i __A) {
- return (__m256d)__builtin_convertvector((__v4di)__A, __v4df);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_cvtepi64_pd(__A),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_cvtepi64_pd(__A),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_ps (__m128i __A) {
- return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
- (__v4sf) _mm_setzero_ps(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
- return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) {
- return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
- (__v4sf) _mm_setzero_ps(),
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_ps (__m256i __A) {
- return (__m128)__builtin_convertvector((__v4di)__A, __v4sf);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm256_cvtepi64_ps(__A),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm256_cvtepi64_ps(__A),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvttpd_epi64 (__m128d __A) {
- return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
- (__v2di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttpd_epi64 (__m256d __A) {
- return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) {
- return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
- (__v4di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) {
- return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvttpd_epu64 (__m128d __A) {
- return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
- (__v2di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttpd_epu64 (__m256d __A) {
- return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) {
- return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
- (__v4di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) {
- return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvttps_epi64 (__m128 __A) {
- return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) {
- return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
- (__v2di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
- return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttps_epi64 (__m128 __A) {
- return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) {
- return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
- (__v4di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) {
- return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvttps_epu64 (__m128 __A) {
- return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) {
- return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
- (__v2di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
- return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
- (__v2di) _mm_setzero_si128(),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttps_epu64 (__m128 __A) {
- return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) {
- return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
- (__v4di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) {
- return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
- (__v4di) _mm256_setzero_si256(),
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_cvtepu64_pd (__m128i __A) {
- return (__m128d)__builtin_convertvector((__v2du)__A, __v2df);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_cvtepu64_pd(__A),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_cvtepu64_pd(__A),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_cvtepu64_pd (__m256i __A) {
- return (__m256d)__builtin_convertvector((__v4du)__A, __v4df);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_cvtepu64_pd(__A),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_cvtepu64_pd(__A),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtepu64_ps (__m128i __A) {
- return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
- (__v4sf) _mm_setzero_ps(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) {
- return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) {
- return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
- (__v4sf) _mm_setzero_ps(),
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_cvtepu64_ps (__m256i __A) {
- return (__m128)__builtin_convertvector((__v4du)__A, __v4sf);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm256_cvtepu64_ps(__A),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm256_cvtepu64_ps(__A),
- (__v4sf)_mm_setzero_ps());
-}
-
-#define _mm_range_pd(A, B, C) \
- ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1))
-
-#define _mm_mask_range_pd(W, U, A, B, C) \
- ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U)))
-
-#define _mm_maskz_range_pd(U, A, B, C) \
- ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), (int)(C), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U)))
-
-#define _mm256_range_pd(A, B, C) \
- ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
- (__v4df)(__m256d)(B), (int)(C), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)-1))
-
-#define _mm256_mask_range_pd(W, U, A, B, C) \
- ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
- (__v4df)(__m256d)(B), (int)(C), \
- (__v4df)(__m256d)(W), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_range_pd(U, A, B, C) \
- ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \
- (__v4df)(__m256d)(B), (int)(C), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)(U)))
-
-#define _mm_range_ps(A, B, C) \
- ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1))
-
-#define _mm_mask_range_ps(W, U, A, B, C) \
- ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)(__m128)(W), (__mmask8)(U)))
-
-#define _mm_maskz_range_ps(U, A, B, C) \
- ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), (int)(C), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U)))
-
-#define _mm256_range_ps(A, B, C) \
- ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
- (__v8sf)(__m256)(B), (int)(C), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1))
-
-#define _mm256_mask_range_ps(W, U, A, B, C) \
- ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
- (__v8sf)(__m256)(B), (int)(C), \
- (__v8sf)(__m256)(W), (__mmask8)(U)))
-
-#define _mm256_maskz_range_ps(U, A, B, C) \
- ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \
- (__v8sf)(__m256)(B), (int)(C), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)(U)))
-
-#define _mm_reduce_pd(A, B) \
- ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1))
-
-#define _mm_mask_reduce_pd(W, U, A, B) \
- ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U)))
-
-#define _mm_maskz_reduce_pd(U, A, B) \
- ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U)))
-
-#define _mm256_reduce_pd(A, B) \
- ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)-1))
-
-#define _mm256_mask_reduce_pd(W, U, A, B) \
- ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
- (__v4df)(__m256d)(W), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_reduce_pd(U, A, B) \
- ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)(U)))
-
-#define _mm_reduce_ps(A, B) \
- ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1))
-
-#define _mm_mask_reduce_ps(W, U, A, B) \
- ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U)))
-
-#define _mm_maskz_reduce_ps(U, A, B) \
- ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U)))
-
-#define _mm256_reduce_ps(A, B) \
- ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1))
-
-#define _mm256_mask_reduce_ps(W, U, A, B) \
- ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
- (__v8sf)(__m256)(W), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_reduce_ps(U, A, B) \
- ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)(U)))
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_movepi32_mask (__m128i __A)
-{
- return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_movepi32_mask (__m256i __A)
-{
- return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_movm_epi32 (__mmask8 __A)
-{
- return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_movm_epi32 (__mmask8 __A)
-{
- return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_movm_epi64 (__mmask8 __A)
-{
- return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_movm_epi64 (__mmask8 __A)
-{
- return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_movepi64_mask (__m128i __A)
-{
- return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_movepi64_mask (__m256i __A)
-{
- return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_broadcast_f32x2 (__m128 __A)
-{
- return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
- 0, 1, 0, 1, 0, 1, 0, 1);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
- (__v8sf)_mm256_broadcast_f32x2(__A),
- (__v8sf)__O);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
- (__v8sf)_mm256_broadcast_f32x2(__A),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_broadcast_f64x2(__m128d __A)
-{
- return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A,
- 0, 1, 0, 1);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A)
-{
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
- (__v4df)_mm256_broadcast_f64x2(__A),
- (__v4df)__O);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
-{
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M,
- (__v4df)_mm256_broadcast_f64x2(__A),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_broadcast_i32x2 (__m128i __A)
-{
- return (__m128i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
- 0, 1, 0, 1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm_broadcast_i32x2(__A),
- (__v4si)__O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm_broadcast_i32x2(__A),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcast_i32x2 (__m128i __A)
-{
- return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
- 0, 1, 0, 1, 0, 1, 0, 1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_broadcast_i32x2(__A),
- (__v8si)__O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_broadcast_i32x2(__A),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcast_i64x2(__m128i __A)
-{
- return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A,
- 0, 1, 0, 1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_broadcast_i64x2(__A),
- (__v4di)__O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_broadcast_i64x2(__A),
- (__v4di)_mm256_setzero_si256());
-}
-
-#define _mm256_extractf64x2_pd(A, imm) \
- ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
- (int)(imm), \
- (__v2df)_mm_undefined_pd(), \
- (__mmask8)-1))
-
-#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \
- ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
- (int)(imm), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_extractf64x2_pd(U, A, imm) \
- ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \
- (int)(imm), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U)))
-
-#define _mm256_extracti64x2_epi64(A, imm) \
- ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
- (int)(imm), \
- (__v2di)_mm_undefined_si128(), \
- (__mmask8)-1))
-
-#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \
- ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
- (int)(imm), \
- (__v2di)(__m128i)(W), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \
- ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \
- (int)(imm), \
- (__v2di)_mm_setzero_si128(), \
- (__mmask8)(U)))
-
-#define _mm256_insertf64x2(A, B, imm) \
- ((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \
- (__v2df)(__m128d)(B), (int)(imm)))
-
-#define _mm256_mask_insertf64x2(W, U, A, B, imm) \
- ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
- (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
- (__v4df)(__m256d)(W)))
-
-#define _mm256_maskz_insertf64x2(U, A, B, imm) \
- ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
- (__v4df)_mm256_insertf64x2((A), (B), (imm)), \
- (__v4df)_mm256_setzero_pd()))
-
-#define _mm256_inserti64x2(A, B, imm) \
- ((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \
- (__v2di)(__m128i)(B), (int)(imm)))
-
-#define _mm256_mask_inserti64x2(W, U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
- (__v4di)(__m256i)(W)))
-
-#define _mm256_maskz_inserti64x2(U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm256_inserti64x2((A), (B), (imm)), \
- (__v4di)_mm256_setzero_si256()))
-
-#define _mm_mask_fpclass_pd_mask(U, A, imm) \
- ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm_fpclass_pd_mask(A, imm) \
- ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm256_mask_fpclass_pd_mask(U, A, imm) \
- ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm256_fpclass_pd_mask(A, imm) \
- ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm_mask_fpclass_ps_mask(U, A, imm) \
- ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm_fpclass_ps_mask(A, imm) \
- ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm256_mask_fpclass_ps_mask(U, A, imm) \
- ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm256_fpclass_ps_mask(A, imm) \
- ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \
- (__mmask8)-1))
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
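For orientation only (not part of the patch), a hedged sketch of how the masked 64-bit lane multiply from the header removed above is typically used, assuming -mavx512vl -mavx512dq and hypothetical argument names:

#include <immintrin.h>

/* Hypothetical helper: lanes whose bit in `keep` is set receive a*b;
 * the remaining lanes fall through unchanged from acc. */
static inline __m256i scale_selected(__m256i acc, __m256i a, __m256i b,
                                     __mmask8 keep)
{
    return _mm256_mask_mullo_epi64(acc, keep, a, b);
}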
+++ /dev/null
-/*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error \
- "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLFP16INTRIN_H
-#define __AVX512VLFP16INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS256 \
- __attribute__((__always_inline__, __nodebug__, \
- __target__("avx512fp16, avx512vl"), \
- __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128 \
- __attribute__((__always_inline__, __nodebug__, \
- __target__("avx512fp16, avx512vl"), \
- __min_vector_width__(128)))
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) {
- return __a[0];
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) {
- return __a[0];
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) {
- return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0};
-}
-
-static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) {
- return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h};
-}
-
-static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) {
- return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h,
- __h, __h, __h, __h, __h, __h, __h, __h};
-}
-
-static __inline __m128h __DEFAULT_FN_ATTRS128
-_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
- _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) {
- return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1};
-}
-
-static __inline __m256h __DEFAULT_FN_ATTRS256
-_mm256_set1_pch(_Float16 _Complex h) {
- return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h));
-}
-
-static __inline __m128h __DEFAULT_FN_ATTRS128
-_mm_set1_pch(_Float16 _Complex h) {
- return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h));
-}
-
-static __inline __m256h __DEFAULT_FN_ATTRS256
-_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
- _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
- _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
- _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) {
- return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11,
- __h10, __h9, __h8, __h7, __h6, __h5,
- __h4, __h3, __h2, __h1};
-}
-
-#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8) \
- _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))
-
-#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
- h14, h15, h16) \
- _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8), \
- (h7), (h6), (h5), (h4), (h3), (h2), (h1))
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A,
- __m256h __B) {
- return (__m256h)((__v16hf)__A + (__v16hf)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_selectph_256(
- __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_selectph_256(
- __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A,
- __m128h __B) {
- return (__m128h)((__v8hf)__A + (__v8hf)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
- (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
- (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A,
- __m256h __B) {
- return (__m256h)((__v16hf)__A - (__v16hf)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_selectph_256(
- __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_selectph_256(
- __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A,
- __m128h __B) {
- return (__m128h)((__v8hf)__A - (__v8hf)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
- (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
- (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A,
- __m256h __B) {
- return (__m256h)((__v16hf)__A * (__v16hf)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_selectph_256(
- __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_selectph_256(
- __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A,
- __m128h __B) {
- return (__m128h)((__v8hf)__A * (__v8hf)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
- (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
- (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A,
- __m256h __B) {
- return (__m256h)((__v16hf)__A / (__v16hf)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_selectph_256(
- __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_selectph_256(
- __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A,
- __m128h __B) {
- return (__m128h)((__v8hf)__A / (__v8hf)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
- (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
- (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A,
- __m256h __B) {
- return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
- (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
- (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
- (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
- (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A,
- __m256h __B) {
- return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
- (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
- (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
- (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
- (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) {
- return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) {
- return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) {
- return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f));
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) {
- return (__m256h)__builtin_ia32_selectps_256(
- (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) {
- return (__m256h)__builtin_ia32_selectps_256(
- (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) {
- return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f));
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W,
- __mmask8 __U,
- __m128h __A) {
- return (__m128h)__builtin_ia32_selectps_128(
- (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) {
- return (__m128h)__builtin_ia32_selectps_128(
- (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps());
-}
-
-#define _mm256_cmp_ph_mask(a, b, p) \
- ((__mmask16)__builtin_ia32_cmpph256_mask( \
- (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1))
-
-#define _mm256_mask_cmp_ph_mask(m, a, b, p) \
- ((__mmask16)__builtin_ia32_cmpph256_mask( \
- (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m)))
-
-#define _mm_cmp_ph_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_cmpph128_mask( \
- (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1))
-
-#define _mm_mask_cmp_ph_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_cmpph128_mask( \
- (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m)))
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) {
- return (__m256h)__builtin_ia32_rcpph256_mask(
- (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
- return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W,
- (__mmask16)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) {
- return (__m256h)__builtin_ia32_rcpph256_mask(
- (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) {
- return (__m128h)__builtin_ia32_rcpph128_mask(
- (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W,
- __mmask8 __U,
- __m128h __A) {
- return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U,
- __m128h __A) {
- return (__m128h)__builtin_ia32_rcpph128_mask(
- (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) {
- return (__m256h)__builtin_ia32_rsqrtph256_mask(
- (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
- return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W,
- (__mmask16)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) {
- return (__m256h)__builtin_ia32_rsqrtph256_mask(
- (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) {
- return (__m128h)__builtin_ia32_rsqrtph128_mask(
- (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W,
- __mmask8 __U,
- __m128h __A) {
- return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) {
- return (__m128h)__builtin_ia32_rsqrtph128_mask(
- (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) {
- return (__m128h)__builtin_ia32_getexpph128_mask(
- (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
- return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) {
- return (__m128h)__builtin_ia32_getexpph128_mask(
- (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) {
- return (__m256h)__builtin_ia32_getexpph256_mask(
- (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
- return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W,
- (__mmask16)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) {
- return (__m256h)__builtin_ia32_getexpph256_mask(
- (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
-}
-
-#define _mm_getmant_ph(A, B, C) \
- ((__m128h)__builtin_ia32_getmantph128_mask( \
- (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1))
-
-#define _mm_mask_getmant_ph(W, U, A, B, C) \
- ((__m128h)__builtin_ia32_getmantph128_mask( \
- (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W), \
- (__mmask8)(U)))
-
-#define _mm_maskz_getmant_ph(U, A, B, C) \
- ((__m128h)__builtin_ia32_getmantph128_mask( \
- (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U)))
-
-#define _mm256_getmant_ph(A, B, C) \
- ((__m256h)__builtin_ia32_getmantph256_mask( \
- (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
- (__v16hf)_mm256_setzero_ph(), (__mmask16)-1))
-
-#define _mm256_mask_getmant_ph(W, U, A, B, C) \
- ((__m256h)__builtin_ia32_getmantph256_mask( \
- (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \
- (__mmask16)(U)))
-
-#define _mm256_maskz_getmant_ph(U, A, B, C) \
- ((__m256h)__builtin_ia32_getmantph256_mask( \
- (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \
- (__v16hf)_mm256_setzero_ph(), (__mmask16)(U)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_scalefph128_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B,
- (__v8hf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_scalefph128_mask(
- (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A,
- __m256h __B) {
- return (__m256h)__builtin_ia32_scalefph256_mask(
- (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B,
- (__v16hf)__W, (__mmask16)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_scalefph256_mask(
- (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
-}
-
-#define _mm_roundscale_ph(A, imm) \
- ((__m128h)__builtin_ia32_rndscaleph_128_mask( \
- (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1))
-
-#define _mm_mask_roundscale_ph(W, U, A, imm) \
- ((__m128h)__builtin_ia32_rndscaleph_128_mask( \
- (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))
-
-#define _mm_maskz_roundscale_ph(U, A, imm) \
- ((__m128h)__builtin_ia32_rndscaleph_128_mask( \
- (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U)))
-
-#define _mm256_roundscale_ph(A, imm) \
- ((__m256h)__builtin_ia32_rndscaleph_256_mask( \
- (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
- (__mmask16)-1))
-
-#define _mm256_mask_roundscale_ph(W, U, A, imm) \
- ((__m256h)__builtin_ia32_rndscaleph_256_mask( \
- (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \
- (__mmask16)(U)))
-
-#define _mm256_maskz_roundscale_ph(U, A, imm) \
- ((__m256h)__builtin_ia32_rndscaleph_256_mask( \
- (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \
- (__mmask16)(U)))
-
-#define _mm_reduce_ph(A, imm) \
- ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \
- (__v8hf)_mm_setzero_ph(), \
- (__mmask8)-1))
-
-#define _mm_mask_reduce_ph(W, U, A, imm) \
- ((__m128h)__builtin_ia32_reduceph128_mask( \
- (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))
-
-#define _mm_maskz_reduce_ph(U, A, imm) \
- ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \
- (__v8hf)_mm_setzero_ph(), \
- (__mmask8)(U)))
-
-#define _mm256_reduce_ph(A, imm) \
- ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
- (__v16hf)_mm256_setzero_ph(), \
- (__mmask16)-1))
-
-#define _mm256_mask_reduce_ph(W, U, A, imm) \
- ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
- (__v16hf)(__m256h)(W), \
- (__mmask16)(U)))
-
-#define _mm256_maskz_reduce_ph(U, A, imm) \
- ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
- (__v16hf)_mm256_setzero_ph(), \
- (__mmask16)(U)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
- return __builtin_ia32_sqrtph((__v8hf)__a);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
- __mmask8 __U,
- __m128h __A) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
- __m128h __A) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph());
-}
-
-static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
- return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) {
- return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
- (__v16hf)_mm256_sqrt_ph(__A),
- (__v16hf)_mm256_setzero_ph());
-}
-
-#define _mm_mask_fpclass_ph_mask(U, A, imm) \
- ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \
- (int)(imm), (__mmask8)(U)))
-
-#define _mm_fpclass_ph_mask(A, imm) \
- ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \
- (int)(imm), (__mmask8)-1))
-
-#define _mm256_mask_fpclass_ph_mask(U, A, imm) \
- ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \
- (int)(imm), (__mmask16)(U)))
-
-#define _mm256_fpclass_ph_mask(A, imm) \
- ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \
- (int)(imm), (__mmask16)-1))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) {
- return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
- (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W,
- __mmask8 __U,
- __m128d __A) {
- return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) {
- return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
- (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) {
- return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
- (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) {
- return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) {
- return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
- (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) {
- return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
- (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W,
- __mmask8 __U,
- __m128h __A) {
- return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
- return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
- (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) {
- return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
- (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) {
- return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
- return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
- (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2w128_mask(
- (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2w128_mask(
- (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtph_epi16(__m256h __A) {
- return (__m256i)__builtin_ia32_vcvtph2w256_mask(
- (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
- return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W,
- (__mmask16)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) {
- return (__m256i)__builtin_ia32_vcvtph2w256_mask(
- (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2w128_mask(
- (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2w128_mask(
- (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttph_epi16(__m256h __A) {
- return (__m256i)__builtin_ia32_vcvttph2w256_mask(
- (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
- return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W,
- (__mmask16)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) {
- return (__m256i)__builtin_ia32_vcvttph2w256_mask(
- (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) {
- return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_cvtepi16_ph(__m256i __A) {
- return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) {
- return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
- (__v16hf)_mm256_cvtepi16_ph(__A),
- (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
- (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
- (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtph_epu16(__m256h __A) {
- return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
- (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
- return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W,
- (__mmask16)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) {
- return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
- (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
- (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
- (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttph_epu16(__m256h __A) {
- return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
- (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
- return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W,
- (__mmask16)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) {
- return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
- (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) {
- return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_cvtepu16_ph(__m256i __A) {
- return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) {
- return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
- (__v16hf)_mm256_cvtepu16_ph(__A),
- (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
- (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
- (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtph_epi32(__m128h __A) {
- return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
- (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
- (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
- (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
- (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtph_epu32(__m128h __A) {
- return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
- (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
- (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) {
- return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
- (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
- return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) {
- return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
- (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_cvtepi32_ph(__m256i __A) {
- return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) {
- return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
- (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
- return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) {
- return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
- (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_cvtepu32_ph(__m256i __A) {
- return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
- (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
- (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttph_epi32(__m128h __A) {
- return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
- (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
- (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
- (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
- (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttph_epu32(__m128h __A) {
- return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
- (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
- (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) {
- return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
- (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
- return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) {
- return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
- (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_ph(__m256i __A) {
- return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
- (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
- return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) {
- return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
- (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
- (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
- (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtph_epi64(__m128h __A) {
- return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
- (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
- (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) {
- return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
- (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
- return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) {
- return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
- (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_cvtepu64_ph(__m256i __A) {
- return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
- (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
- return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) {
- return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
- (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
- (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
- (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtph_epu64(__m128h __A) {
- return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
- (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
- (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
- (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
- (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttph_epi64(__m128h __A) {
- return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
- (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
- (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
- (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
- return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
- (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttph_epu64(__m128h __A) {
- return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
- (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
- return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
- (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) {
- return (__m128)__builtin_ia32_vcvtph2psx128_mask(
- (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W,
- __mmask8 __U,
- __m128h __A) {
- return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
- return (__m128)__builtin_ia32_vcvtph2psx128_mask(
- (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) {
- return (__m256)__builtin_ia32_vcvtph2psx256_mask(
- (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) {
- return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
- return (__m256)__builtin_ia32_vcvtph2psx256_mask(
- (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) {
- return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
- (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W,
- __mmask8 __U,
- __m128 __A) {
- return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) {
- return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
- (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) {
- return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
- (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) {
- return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W,
- (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
- return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
- (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A,
- __m128h __B,
- __m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
- (__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
- __mmask8 __U,
- __m128h __B,
- __m128h __C) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
- (__v8hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
- (__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
- (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A,
- __m128h __B,
- __m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
- -(__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A,
- __mmask8 __U,
- __m128h __B,
- __m128h __C) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
- (__v8hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
- (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
- (__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
- (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
- (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A,
- __m256h __B,
- __m256h __C) {
- return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
- (__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
- (__v16hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
- (__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
- (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A,
- __m256h __B,
- __m256h __C) {
- return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
- -(__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
- (__v16hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
- (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
- (__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
- (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
- (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A,
- __m128h __B,
- __m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
- (__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
- (__v8hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
- (__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
- (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A,
- __m128h __B,
- __m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
- -(__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
- (__v8hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
- (__v8hf)_mm_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
- (__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
- (__v16hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
- (__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
- (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
- -(__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
- (__v16hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
- (__v16hf)_mm256_setzero_ph());
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
- (__v8hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
- (__v16hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
- (__v8hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
- (__v16hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A,
- __m128h __B,
- __m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
- (__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
- (__v8hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A,
- __m256h __B,
- __m256h __C) {
- return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
- (__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
- (__v16hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A,
- __m128h __B,
- __m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
- -(__v8hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
- (__v8hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
- return (__m128h)__builtin_ia32_selectph_128(
- (__mmask8)__U,
- __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
- (__v8hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A,
- __m256h __B,
- __m256h __C) {
- return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
- -(__v16hf)__C);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
- (__v16hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
- return (__m256h)__builtin_ia32_selectph_256(
- (__mmask16)__U,
- __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
- (__v16hf)__C);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_vfcmulcph128_mask(
- (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_vfcmulcph128_mask(
- (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmul_pch(__m256h __A,
- __m256h __B) {
- return (__m256h)__builtin_ia32_vfcmulcph256_mask(
- (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B,
- (__v8sf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_vfcmulcph256_mask(
- (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A,
- __m128h __B,
- __m128h __C) {
- return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__C, (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectps_128(
- __U,
- __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B,
- (__v4sf)__C, (__mmask8)__U),
- (__v4sf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
- return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_vfcmaddcph128_maskz(
- (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A,
- __m256h __B,
- __m256h __C) {
- return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
- (__v8sf)__C, (__mmask8)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectps_256(
- __U,
- __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
- (__mmask8)__U),
- (__v8sf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
- return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
- (__v8sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_vfcmaddcph256_maskz(
- (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_vfmulcph128_mask(
- (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W,
- __mmask8 __U,
- __m128h __A,
- __m128h __B) {
- return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
- return (__m128h)__builtin_ia32_vfmulcph128_mask(
- (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A,
- __m256h __B) {
- return (__m256h)__builtin_ia32_vfmulcph256_mask(
- (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B,
- (__v8sf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
- return (__m256h)__builtin_ia32_vfmulcph256_mask(
- (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A,
- __m128h __B,
- __m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__C, (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_selectps_128(
- __U,
- __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
- (__mmask8)__U),
- (__v4sf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
- return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
- return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B,
- (__v4sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A,
- __m256h __B,
- __m256h __C) {
- return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
- (__v8sf)__C, (__mmask8)-1);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_selectps_256(
- __U,
- __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
- (__mmask8)__U),
- (__v8sf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
- return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
- (__v8sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
- return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B,
- (__v8sf)__C, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U,
- __m128h __A,
- __m128h __W) {
- return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W,
- (__v8hf)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
- return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W,
- (__v16hf)__A);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
- return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
- (__v8hi)__B);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
- return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
- (__v16hi)__B);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128
-_mm_permutexvar_ph(__m128i __A, __m128h __B) {
- return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
-}
-
-static __inline__ __m256h __DEFAULT_FN_ATTRS256
-_mm256_permutexvar_ph(__m256i __A, __m256h __B) {
- return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS256
-_mm256_reduce_add_ph(__m256h __W) {
- return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS256
-_mm256_reduce_mul_ph(__m256h __W) {
- return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS256
-_mm256_reduce_max_ph(__m256h __V) {
- return __builtin_ia32_reduce_fmax_ph256(__V);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS256
-_mm256_reduce_min_ph(__m256h __V) {
- return __builtin_ia32_reduce_fmin_ph256(__V);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS128
-_mm_reduce_add_ph(__m128h __W) {
- return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS128
-_mm_reduce_mul_ph(__m128h __W) {
- return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS128
-_mm_reduce_max_ph(__m128h __V) {
- return __builtin_ia32_reduce_fmax_ph128(__V);
-}
-
-static __inline__ _Float16 __DEFAULT_FN_ATTRS128
-_mm_reduce_min_ph(__m128h __V) {
- return __builtin_ia32_reduce_fmin_ph128(__V);
-}
-
-// The intrinsics below are aliases for the corresponding f*mul_*ch intrinsics.
-#define _mm_mul_pch(A, B) _mm_fmul_pch(A, B)
-#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B)
-#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B)
-#define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B)
-#define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B)
-#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B)
-
-#define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B)
-#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B)
-#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B)
-#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B)
-#define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B)
-#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B)
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
+++ /dev/null
-/*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLINTRIN_H
-#define __AVX512VLINTRIN_H
-
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(256)))
-
-typedef short __v2hi __attribute__((__vector_size__(4)));
-typedef char __v4qi __attribute__((__vector_size__(4)));
-typedef char __v2qi __attribute__((__vector_size__(2)));
-
-/* Integer compare */
-
-#define _mm_cmpeq_epi32_mask(A, B) \
- _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epi32_mask(k, A, B) \
- _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epi32_mask(A, B) \
- _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epi32_mask(k, A, B) \
- _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epi32_mask(A, B) \
- _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epi32_mask(k, A, B) \
- _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epi32_mask(A, B) \
- _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epi32_mask(k, A, B) \
- _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epi32_mask(A, B) \
- _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epi32_mask(k, A, B) \
- _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epi32_mask(A, B) \
- _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epi32_mask(k, A, B) \
- _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epi32_mask(A, B) \
- _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epi32_mask(k, A, B) \
- _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epi32_mask(A, B) \
- _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epi32_mask(k, A, B) \
- _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epi32_mask(A, B) \
- _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epi32_mask(k, A, B) \
- _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epi32_mask(A, B) \
- _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epi32_mask(k, A, B) \
- _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epi32_mask(A, B) \
- _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epi32_mask(k, A, B) \
- _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epi32_mask(A, B) \
- _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epi32_mask(k, A, B) \
- _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm_cmpeq_epu32_mask(A, B) \
- _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epu32_mask(k, A, B) \
- _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epu32_mask(A, B) \
- _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epu32_mask(k, A, B) \
- _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epu32_mask(A, B) \
- _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epu32_mask(k, A, B) \
- _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epu32_mask(A, B) \
- _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epu32_mask(k, A, B) \
- _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epu32_mask(A, B) \
- _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epu32_mask(k, A, B) \
- _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epu32_mask(A, B) \
- _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epu32_mask(k, A, B) \
- _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epu32_mask(A, B) \
- _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epu32_mask(k, A, B) \
- _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epu32_mask(A, B) \
- _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epu32_mask(k, A, B) \
- _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epu32_mask(A, B) \
- _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epu32_mask(k, A, B) \
- _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epu32_mask(A, B) \
- _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epu32_mask(k, A, B) \
- _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epu32_mask(A, B) \
- _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epu32_mask(k, A, B) \
- _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epu32_mask(A, B) \
- _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epu32_mask(k, A, B) \
- _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm_cmpeq_epi64_mask(A, B) \
- _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epi64_mask(k, A, B) \
- _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epi64_mask(A, B) \
- _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epi64_mask(k, A, B) \
- _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epi64_mask(A, B) \
- _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epi64_mask(k, A, B) \
- _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epi64_mask(A, B) \
- _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epi64_mask(k, A, B) \
- _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epi64_mask(A, B) \
- _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epi64_mask(k, A, B) \
- _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epi64_mask(A, B) \
- _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epi64_mask(k, A, B) \
- _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epi64_mask(A, B) \
- _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epi64_mask(k, A, B) \
- _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epi64_mask(A, B) \
- _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epi64_mask(k, A, B) \
- _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epi64_mask(A, B) \
- _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epi64_mask(k, A, B) \
- _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epi64_mask(A, B) \
- _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epi64_mask(k, A, B) \
- _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epi64_mask(A, B) \
- _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epi64_mask(k, A, B) \
- _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epi64_mask(A, B) \
- _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epi64_mask(k, A, B) \
- _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm_cmpeq_epu64_mask(A, B) \
- _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm_mask_cmpeq_epu64_mask(k, A, B) \
- _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm_cmpge_epu64_mask(A, B) \
- _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm_mask_cmpge_epu64_mask(k, A, B) \
- _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm_cmpgt_epu64_mask(A, B) \
- _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm_mask_cmpgt_epu64_mask(k, A, B) \
- _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm_cmple_epu64_mask(A, B) \
- _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm_mask_cmple_epu64_mask(k, A, B) \
- _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm_cmplt_epu64_mask(A, B) \
- _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm_mask_cmplt_epu64_mask(k, A, B) \
- _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm_cmpneq_epu64_mask(A, B) \
- _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm_mask_cmpneq_epu64_mask(k, A, B) \
- _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm256_cmpeq_epu64_mask(A, B) \
- _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm256_mask_cmpeq_epu64_mask(k, A, B) \
- _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm256_cmpge_epu64_mask(A, B) \
- _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm256_mask_cmpge_epu64_mask(k, A, B) \
- _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm256_cmpgt_epu64_mask(A, B) \
- _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm256_mask_cmpgt_epu64_mask(k, A, B) \
- _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm256_cmple_epu64_mask(A, B) \
- _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm256_mask_cmple_epu64_mask(k, A, B) \
- _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm256_cmplt_epu64_mask(A, B) \
- _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm256_mask_cmplt_epu64_mask(k, A, B) \
- _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm256_cmpneq_epu64_mask(A, B) \
- _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm256_mask_cmpneq_epu64_mask(k, A, B) \
- _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_add_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_add_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_add_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_add_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_sub_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_sub_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_sub_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_sub_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_add_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_add_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_add_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_add_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_sub_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_sub_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_sub_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_sub_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_mul_epi32(__X, __Y),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_mul_epi32(__X, __Y),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
- (__v2di)_mm_mul_epi32(__X, __Y),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
- (__v2di)_mm_mul_epi32(__X, __Y),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_mul_epu32(__X, __Y),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_mul_epu32(__X, __Y),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
- (__v2di)_mm_mul_epu32(__X, __Y),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
- (__v2di)_mm_mul_epu32(__X, __Y),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_mullo_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_mullo_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm_mullo_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm_mullo_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_and_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v8su)__a & (__v8su)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_and_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)_mm256_mask_and_epi32(_mm256_setzero_si256(), __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_and_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v4su)__a & (__v4su)__b);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_and_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)_mm_mask_and_epi32(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_andnot_epi32(__m256i __A, __m256i __B)
-{
- return (__m256i)(~(__v8su)__A & (__v8su)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_andnot_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)_mm256_mask_andnot_epi32(_mm256_setzero_si256(),
- __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_andnot_epi32(__m128i __A, __m128i __B)
-{
- return (__m128i)(~(__v4su)__A & (__v4su)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_andnot_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_andnot_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)_mm_mask_andnot_epi32(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_or_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v8su)__a | (__v8su)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_or_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)_mm256_mask_or_epi32(_mm256_setzero_si256(), __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_or_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v4su)__a | (__v4su)__b);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_or_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)_mm_mask_or_epi32(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_xor_epi32(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v8su)__a ^ (__v8su)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_xor_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)_mm256_mask_xor_epi32(_mm256_setzero_si256(), __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_xor_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v4su)__a ^ (__v4su)__b);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_xor_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)_mm_mask_xor_epi32(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_and_epi64(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v4du)__a & (__v4du)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_and_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)_mm256_mask_and_epi64(_mm256_setzero_si256(), __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_and_epi64(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v2du)__a & (__v2du)__b);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_and_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)_mm_mask_and_epi64(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_andnot_epi64(__m256i __A, __m256i __B)
-{
- return (__m256i)(~(__v4du)__A & (__v4du)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_andnot_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)_mm256_mask_andnot_epi64(_mm256_setzero_si256(),
- __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_andnot_epi64(__m128i __A, __m128i __B)
-{
- return (__m128i)(~(__v2du)__A & (__v2du)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_andnot_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)_mm_mask_andnot_epi64(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_or_epi64(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v4du)__a | (__v4du)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_or_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)_mm256_mask_or_epi64(_mm256_setzero_si256(), __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_or_epi64(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v2du)__a | (__v2du)__b);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_or_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)_mm_mask_or_epi64(_mm_setzero_si128(), __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_xor_epi64(__m256i __a, __m256i __b)
-{
- return (__m256i)((__v4du)__a ^ (__v4du)__b);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_xor_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)_mm256_mask_xor_epi64(_mm256_setzero_si256(), __U, __A, __B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_xor_epi64(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v2du)__a ^ (__v2du)__b);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A,
- __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_xor_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)_mm_mask_xor_epi64(_mm_setzero_si128(), __U, __A, __B);
-}
-
-#define _mm_cmp_epi32_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
- (__v4si)(__m128i)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm_mask_cmp_epi32_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \
- (__v4si)(__m128i)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm_cmp_epu32_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
- (__v4si)(__m128i)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm_mask_cmp_epu32_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \
- (__v4si)(__m128i)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm256_cmp_epi32_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
- (__v8si)(__m256i)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm256_mask_cmp_epi32_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \
- (__v8si)(__m256i)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm256_cmp_epu32_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
- (__v8si)(__m256i)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm256_mask_cmp_epu32_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \
- (__v8si)(__m256i)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm_cmp_epi64_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
- (__v2di)(__m128i)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm_mask_cmp_epi64_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \
- (__v2di)(__m128i)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm_cmp_epu64_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
- (__v2di)(__m128i)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm_mask_cmp_epu64_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \
- (__v2di)(__m128i)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm256_cmp_epi64_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
- (__v4di)(__m256i)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm256_mask_cmp_epi64_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \
- (__v4di)(__m256i)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm256_cmp_epu64_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
- (__v4di)(__m256i)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm256_mask_cmp_epu64_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \
- (__v4di)(__m256i)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm256_cmp_ps_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
- (__v8sf)(__m256)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm256_mask_cmp_ps_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \
- (__v8sf)(__m256)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm256_cmp_pd_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
- (__v4df)(__m256d)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm256_mask_cmp_pd_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \
- (__v4df)(__m256d)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm_cmp_ps_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
- (__v4sf)(__m128)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm_mask_cmp_ps_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \
- (__v4sf)(__m128)(b), (int)(p), \
- (__mmask8)(m)))
-
-#define _mm_cmp_pd_mask(a, b, p) \
- ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
- (__v2df)(__m128d)(b), (int)(p), \
- (__mmask8)-1))
-
-#define _mm_mask_cmp_pd_mask(m, a, b, p) \
- ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \
- (__v2df)(__m128d)(b), (int)(p), \
- (__mmask8)(m)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddpd ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __C),
- (__v2df) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddpd ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __C),
- (__v2df) __C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddpd ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __C),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddpd ((__v2df) __A,
- (__v2df) __B,
- -(__v2df) __C),
- (__v2df) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddpd ((__v2df) __A,
- (__v2df) __B,
- -(__v2df) __C),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddpd (-(__v2df) __A,
- (__v2df) __B,
- (__v2df) __C),
- (__v2df) __C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddpd (-(__v2df) __A,
- (__v2df) __B,
- (__v2df) __C),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddpd (-(__v2df) __A,
- (__v2df) __B,
- -(__v2df) __C),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddpd256 ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __C),
- (__v4df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddpd256 ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __C),
- (__v4df) __C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddpd256 ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __C),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddpd256 ((__v4df) __A,
- (__v4df) __B,
- -(__v4df) __C),
- (__v4df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddpd256 ((__v4df) __A,
- (__v4df) __B,
- -(__v4df) __C),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
- (__v4df) __B,
- (__v4df) __C),
- (__v4df) __C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
- (__v4df) __B,
- (__v4df) __C),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddpd256 (-(__v4df) __A,
- (__v4df) __B,
- -(__v4df) __C),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddps ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __C),
- (__v4sf) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddps ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __C),
- (__v4sf) __C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddps ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __C),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddps ((__v4sf) __A,
- (__v4sf) __B,
- -(__v4sf) __C),
- (__v4sf) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddps ((__v4sf) __A,
- (__v4sf) __B,
- -(__v4sf) __C),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddps (-(__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __C),
- (__v4sf) __C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddps (-(__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __C),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddps (-(__v4sf) __A,
- (__v4sf) __B,
- -(__v4sf) __C),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddps256 ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __C),
- (__v8sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddps256 ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __C),
- (__v8sf) __C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddps256 ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __C),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddps256 ((__v8sf) __A,
- (__v8sf) __B,
- -(__v8sf) __C),
- (__v8sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddps256 ((__v8sf) __A,
- (__v8sf) __B,
- -(__v8sf) __C),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __C),
- (__v8sf) __C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __C),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddps256 (-(__v8sf) __A,
- (__v8sf) __B,
- -(__v8sf) __C),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddsubpd ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __C),
- (__v2df) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddsubpd ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __C),
- (__v2df) __C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddsubpd ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __C),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddsubpd ((__v2df) __A,
- (__v2df) __B,
- -(__v2df) __C),
- (__v2df) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddsubpd ((__v2df) __A,
- (__v2df) __B,
- -(__v2df) __C),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __C),
- (__v4df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __C),
- (__v4df) __C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __C),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
- (__v4df) __B,
- -(__v4df) __C),
- (__v4df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
- (__v4df) __B,
- -(__v4df) __C),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddsubps ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __C),
- (__v4sf) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddsubps ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __C),
- (__v4sf) __C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddsubps ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __C),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddsubps ((__v4sf) __A,
- (__v4sf) __B,
- -(__v4sf) __C),
- (__v4sf) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddsubps ((__v4sf) __A,
- (__v4sf) __B,
- -(__v4sf) __C),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B,
- __m256 __C)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __C),
- (__v8sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __C),
- (__v8sf) __C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __C),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
- (__v8sf) __B,
- -(__v8sf) __C),
- (__v8sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
- (__v8sf) __B,
- -(__v8sf) __C),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddpd ((__v2df) __A,
- (__v2df) __B,
- -(__v2df) __C),
- (__v2df) __C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddpd256 ((__v4df) __A,
- (__v4df) __B,
- -(__v4df) __C),
- (__v4df) __C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddps ((__v4sf) __A,
- (__v4sf) __B,
- -(__v4sf) __C),
- (__v4sf) __C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddps256 ((__v8sf) __A,
- (__v8sf) __B,
- -(__v8sf) __C),
- (__v8sf) __C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddsubpd ((__v2df) __A,
- (__v2df) __B,
- -(__v2df) __C),
- (__v2df) __C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddsubpd256 ((__v4df) __A,
- (__v4df) __B,
- -(__v4df) __C),
- (__v4df) __C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddsubps ((__v4sf) __A,
- (__v4sf) __B,
- -(__v4sf) __C),
- (__v4sf) __C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddsubps256 ((__v8sf) __A,
- (__v8sf) __B,
- -(__v8sf) __C),
- (__v8sf) __C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddpd ((__v2df) __A,
- -(__v2df) __B,
- (__v2df) __C),
- (__v2df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddpd256 ((__v4df) __A,
- -(__v4df) __B,
- (__v4df) __C),
- (__v4df) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddps ((__v4sf) __A,
- -(__v4sf) __B,
- (__v4sf) __C),
- (__v4sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddps256 ((__v8sf) __A,
- -(__v8sf) __B,
- (__v8sf) __C),
- (__v8sf) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddpd ((__v2df) __A,
- -(__v2df) __B,
- -(__v2df) __C),
- (__v2df) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U)
-{
- return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U,
- __builtin_ia32_vfmaddpd ((__v2df) __A,
- -(__v2df) __B,
- -(__v2df) __C),
- (__v2df) __C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddpd256 ((__v4df) __A,
- -(__v4df) __B,
- -(__v4df) __C),
- (__v4df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U)
-{
- return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U,
- __builtin_ia32_vfmaddpd256 ((__v4df) __A,
- -(__v4df) __B,
- -(__v4df) __C),
- (__v4df) __C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddps ((__v4sf) __A,
- -(__v4sf) __B,
- -(__v4sf) __C),
- (__v4sf) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
-{
- return (__m128) __builtin_ia32_selectps_128((__mmask8) __U,
- __builtin_ia32_vfmaddps ((__v4sf) __A,
- -(__v4sf) __B,
- -(__v4sf) __C),
- (__v4sf) __C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddps256 ((__v8sf) __A,
- -(__v8sf) __B,
- -(__v8sf) __C),
- (__v8sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U)
-{
- return (__m256) __builtin_ia32_selectps_256((__mmask8) __U,
- __builtin_ia32_vfmaddps256 ((__v8sf) __A,
- -(__v8sf) __B,
- -(__v8sf) __C),
- (__v8sf) __C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_add_pd(__A, __B),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_add_pd(__A, __B),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_add_pd(__A, __B),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_add_pd(__A, __B),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_add_ps(__A, __B),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_add_ps(__A, __B),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_add_ps(__A, __B),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_add_ps(__A, __B),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) {
- return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
- (__v4si) __W,
- (__v4si) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) {
- return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
- (__v8si) __W,
- (__v8si) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) {
- return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
- (__v2df) __W,
- (__v2df) __A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) {
- return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
- (__v4df) __W,
- (__v4df) __A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) {
- return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
- (__v4sf) __W,
- (__v4sf) __A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) {
- return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
- (__v8sf) __W,
- (__v8sf) __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) {
- return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
- (__v2di) __W,
- (__v2di) __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) {
- return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
- (__v4di) __W,
- (__v4di) __A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) {
- return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
- (__v2df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) {
- return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) {
- return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
- (__v4df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) {
- return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
- return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
- (__v2di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) {
- return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
- return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
- (__v4di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) {
- return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) {
- return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) {
- return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) {
- return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
- (__v8sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) {
- return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
- return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
- (__v4si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) {
- return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
- return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
- (__v8si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) {
- return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) {
- __builtin_ia32_compressstoredf128_mask ((__v2df *) __P,
- (__v2df) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) {
- __builtin_ia32_compressstoredf256_mask ((__v4df *) __P,
- (__v4df) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) {
- __builtin_ia32_compressstoredi128_mask ((__v2di *) __P,
- (__v2di) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) {
- __builtin_ia32_compressstoredi256_mask ((__v4di *) __P,
- (__v4di) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) {
- __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P,
- (__v4sf) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) {
- __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P,
- (__v8sf) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) {
- __builtin_ia32_compressstoresi128_mask ((__v4si *) __P,
- (__v4si) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) {
- __builtin_ia32_compressstoresi256_mask ((__v8si *) __P,
- (__v8si) __A,
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
- (__v2df)_mm_cvtepi32_pd(__A),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
- (__v2df)_mm_cvtepi32_pd(__A),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
- (__v4df)_mm256_cvtepi32_pd(__A),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
- (__v4df)_mm256_cvtepi32_pd(__A),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_cvtepi32_ps(__A),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_cvtepi32_ps(__A),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_cvtepi32_ps(__A),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_cvtepi32_ps(__A),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
- (__v4si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm256_cvtpd_epi32(__A),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm256_cvtpd_epi32(__A),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) {
- return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) {
- return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm256_cvtpd_ps(__A),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm256_cvtpd_ps(__A),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtpd_epu32 (__m128d __A) {
- return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
- (__v4si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtpd_epu32 (__m256d __A) {
- return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
- return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
- (__v4si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) {
- return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_cvtps_epi32(__A),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_cvtps_epi32(__A),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_cvtps_epi32(__A),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_cvtps_epi32(__A),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_cvtps_pd(__A),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_cvtps_pd(__A),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_cvtps_pd(__A),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_cvtps_pd(__A),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtps_epu32 (__m128 __A) {
- return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
- return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
- (__v4si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) {
- return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvtps_epu32 (__m256 __A) {
- return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
- return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
- (__v8si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) {
- return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
- (__v4si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm256_cvttpd_epi32(__A),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm256_cvttpd_epi32(__A),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvttpd_epu32 (__m128d __A) {
- return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
- (__v4si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) {
- return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvttpd_epu32 (__m256d __A) {
- return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) {
- return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
- (__v4si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) {
- return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_cvttps_epi32(__A),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_cvttps_epi32(__A),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_cvttps_epi32(__A),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_cvttps_epi32(__A),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvttps_epu32 (__m128 __A) {
- return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) {
- return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
- (__v4si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) {
- return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cvttps_epu32 (__m256 __A) {
- return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) {
- return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
- (__v8si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) {
- return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_cvtepu32_pd (__m128i __A) {
- return (__m128d) __builtin_convertvector(
- __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
- (__v2df)_mm_cvtepu32_pd(__A),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U,
- (__v2df)_mm_cvtepu32_pd(__A),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_cvtepu32_pd (__m128i __A) {
- return (__m256d)__builtin_convertvector((__v4su)__A, __v4df);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
- (__v4df)_mm256_cvtepu32_pd(__A),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U,
- (__v4df)_mm256_cvtepu32_pd(__A),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtepu32_ps (__m128i __A) {
- return (__m128)__builtin_convertvector((__v4su)__A, __v4sf);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_cvtepu32_ps(__A),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_cvtepu32_ps(__A),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_cvtepu32_ps (__m256i __A) {
- return (__m256)__builtin_convertvector((__v8su)__A, __v8sf);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_cvtepu32_ps(__A),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_cvtepu32_ps(__A),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_div_pd(__A, __B),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_div_pd(__A, __B),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_div_pd(__A, __B),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_div_pd(__A, __B),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_div_ps(__A, __B),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_div_ps(__A, __B),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_div_ps(__A, __B),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_div_ps(__A, __B),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) {
- return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
- (__v2df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) {
- return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) {
- return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
- (__v4df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) {
- return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
- return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
- (__v2di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) {
- return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
- return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
- (__v4di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) {
- return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) {
- return (__m128d) __builtin_ia32_expandloaddf128_mask ((const __v2df *) __P,
- (__v2df) __W,
- (__mmask8)
- __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
- return (__m128d) __builtin_ia32_expandloaddf128_mask ((const __v2df *) __P,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8)
- __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) {
- return (__m256d) __builtin_ia32_expandloaddf256_mask ((const __v4df *) __P,
- (__v4df) __W,
- (__mmask8)
- __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) {
- return (__m256d) __builtin_ia32_expandloaddf256_mask ((const __v4df *) __P,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8)
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) {
- return (__m128i) __builtin_ia32_expandloaddi128_mask ((const __v2di *) __P,
- (__v2di) __W,
- (__mmask8)
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
- return (__m128i) __builtin_ia32_expandloaddi128_mask ((const __v2di *) __P,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8)
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U,
- void const *__P) {
- return (__m256i) __builtin_ia32_expandloaddi256_mask ((const __v4di *) __P,
- (__v4di) __W,
- (__mmask8)
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) {
- return (__m256i) __builtin_ia32_expandloaddi256_mask ((const __v4di *) __P,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8)
- __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) {
- return (__m128) __builtin_ia32_expandloadsf128_mask ((const __v4sf *) __P,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
- return (__m128) __builtin_ia32_expandloadsf128_mask ((const __v4sf *) __P,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8)
- __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) {
- return (__m256) __builtin_ia32_expandloadsf256_mask ((const __v8sf *) __P,
- (__v8sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) {
- return (__m256) __builtin_ia32_expandloadsf256_mask ((const __v8sf *) __P,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8)
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) {
- return (__m128i) __builtin_ia32_expandloadsi128_mask ((const __v4si *) __P,
- (__v4si) __W,
- (__mmask8)
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
- return (__m128i) __builtin_ia32_expandloadsi128_mask ((const __v4si *) __P,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U,
- void const *__P) {
- return (__m256i) __builtin_ia32_expandloadsi256_mask ((const __v8si *) __P,
- (__v8si) __W,
- (__mmask8)
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) {
- return (__m256i) __builtin_ia32_expandloadsi256_mask ((const __v8si *) __P,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8)
- __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) {
- return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) {
- return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) {
- return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
- (__v8sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) {
- return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) {
- return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
- (__v4si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) {
- return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) {
- return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
- (__v8si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) {
- return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_getexp_pd (__m128d __A) {
- return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) {
- return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
- (__v2df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) {
- return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_getexp_pd (__m256d __A) {
- return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) {
- return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
- (__v4df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) {
- return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_getexp_ps (__m128 __A) {
- return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) {
- return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) {
- return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_getexp_ps (__m256 __A) {
- return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) {
- return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
- (__v8sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) {
- return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_max_pd(__A, __B),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_max_pd(__A, __B),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_max_pd(__A, __B),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_max_pd(__A, __B),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_max_ps(__A, __B),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_max_ps(__A, __B),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_max_ps(__A, __B),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_max_ps(__A, __B),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_min_pd(__A, __B),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_min_pd(__A, __B),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_min_pd(__A, __B),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_min_pd(__A, __B),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_min_ps(__A, __B),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_min_ps(__A, __B),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_min_ps(__A, __B),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_min_ps(__A, __B),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_mul_pd(__A, __B),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_mul_pd(__A, __B),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_mul_pd(__A, __B),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_mul_pd(__A, __B),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_mul_ps(__A, __B),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_mul_ps(__A, __B),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_mul_ps(__A, __B),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_mul_ps(__A, __B),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_abs_epi32(__A),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_abs_epi32(__A),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_abs_epi32(__A),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_abs_epi32(__A),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_abs_epi64 (__m128i __A) {
-#if (__clang_major__ < 14)
- return (__m128i)__builtin_ia32_pabsq128((__v2di)__A);
-#else
- return (__m128i)__builtin_elementwise_abs((__v2di)__A);
-#endif
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_abs_epi64(__A),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_abs_epi64(__A),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_abs_epi64 (__m256i __A) {
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pabsq256 ((__v4di)__A);
-#else
- return (__m256i)__builtin_elementwise_abs((__v4di)__A);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_abs_epi64(__A),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_abs_epi64(__A),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm_max_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm_max_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_max_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_max_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_max_epi64 (__m128i __A, __m128i __B) {
-#if (__clang_major__ < 14)
- return (__m128i)__builtin_ia32_pmaxsq128((__v2di)__A, (__v2di)__B);
-#else
- return (__m128i)__builtin_elementwise_max((__v2di)__A, (__v2di)__B);
-#endif
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
- (__v2di)_mm_max_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
- (__v2di)_mm_max_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epi64 (__m256i __A, __m256i __B) {
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pmaxsq256((__v4di)__A, (__v4di)__B);
-#else
- return (__m256i)__builtin_elementwise_max((__v4di)__A, (__v4di)__B);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_max_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_max_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm_max_epu32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm_max_epu32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_max_epu32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_max_epu32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_max_epu64 (__m128i __A, __m128i __B) {
-#if (__clang_major__ < 14)
- return (__m128i)__builtin_ia32_pmaxuq128((__v2di)__A, (__v2di)__B);
-#else
- return (__m128i)__builtin_elementwise_max((__v2du)__A, (__v2du)__B);
-#endif
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
- (__v2di)_mm_max_epu64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
- (__v2di)_mm_max_epu64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epu64 (__m256i __A, __m256i __B) {
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pmaxuq256((__v4di)__A, (__v4di)__B);
-#else
- return (__m256i)__builtin_elementwise_max((__v4du)__A, (__v4du)__B);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_max_epu64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_max_epu64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm_min_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm_min_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_min_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_min_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_min_epi64 (__m128i __A, __m128i __B) {
-#if (__clang_major__ < 14)
- return (__m128i)__builtin_ia32_pminsq128((__v2di)__A, (__v2di)__B);
-#else
- return (__m128i)__builtin_elementwise_min((__v2di)__A, (__v2di)__B);
-#endif
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
- (__v2di)_mm_min_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
- (__v2di)_mm_min_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epi64 (__m256i __A, __m256i __B) {
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pminsq256((__v4di)__A, (__v4di)__B);
-#else
- return (__m256i)__builtin_elementwise_min((__v4di)__A, (__v4di)__B);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_min_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_min_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm_min_epu32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm_min_epu32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_min_epu32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_min_epu32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_min_epu64 (__m128i __A, __m128i __B) {
-#if (__clang_major__ < 14)
- return (__m128i)__builtin_ia32_pminuq128((__v2di)__A, (__v2di)__B);
-#else
- return (__m128i)__builtin_elementwise_min((__v2du)__A, (__v2du)__B);
-#endif
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
- (__v2di)_mm_min_epu64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M,
- (__v2di)_mm_min_epu64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epu64 (__m256i __A, __m256i __B) {
-#if (__clang_major__ < 14)
- return (__m256i)__builtin_ia32_pminuq256((__v4di)__A, (__v4di)__B);
-#else
- return (__m256i)__builtin_elementwise_min((__v4du)__A, (__v4du)__B);
-#endif
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_min_epu64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_min_epu64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-#define _mm_roundscale_pd(A, imm) \
- ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
- (int)(imm), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1))
-
-
-#define _mm_mask_roundscale_pd(W, U, A, imm) \
- ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
- (int)(imm), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U)))
-
-
-#define _mm_maskz_roundscale_pd(U, A, imm) \
- ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \
- (int)(imm), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U)))
-
-
-#define _mm256_roundscale_pd(A, imm) \
- ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
- (int)(imm), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)-1))
-
-
-#define _mm256_mask_roundscale_pd(W, U, A, imm) \
- ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
- (int)(imm), \
- (__v4df)(__m256d)(W), \
- (__mmask8)(U)))
-
-
-#define _mm256_maskz_roundscale_pd(U, A, imm) \
- ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \
- (int)(imm), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)(U)))
-
-#define _mm_roundscale_ps(A, imm) \
- ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1))
-
-
-#define _mm_mask_roundscale_ps(W, U, A, imm) \
- ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U)))
-
-
-#define _mm_maskz_roundscale_ps(U, A, imm) \
- ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U)))
-
-#define _mm256_roundscale_ps(A, imm) \
- ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1))
-
-#define _mm256_mask_roundscale_ps(W, U, A, imm) \
- ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
- (__v8sf)(__m256)(W), \
- (__mmask8)(U)))
-
-
-#define _mm256_maskz_roundscale_ps(U, A, imm) \
- ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)(U)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_scalef_pd (__m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A,
- __m128d __B) {
- return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
- (__v2df) __B,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_scalef_pd (__m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A,
- __m256d __B) {
- return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
- (__v4df) __B,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_scalef_ps (__m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
- (__v4sf) __B,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_scalef_ps (__m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A,
- __m256 __B) {
- return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
- (__v8sf) __B,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
-}
-
-#define _mm_i64scatter_pd(addr, index, v1, scale) \
- __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)-1, \
- (__v2di)(__m128i)(index), \
- (__v2df)(__m128d)(v1), (int)(scale))
-
-#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) \
- __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)(mask), \
- (__v2di)(__m128i)(index), \
- (__v2df)(__m128d)(v1), (int)(scale))
-
-#define _mm_i64scatter_epi64(addr, index, v1, scale) \
- __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)-1, \
- (__v2di)(__m128i)(index), \
- (__v2di)(__m128i)(v1), (int)(scale))
-
-#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
- __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)(mask), \
- (__v2di)(__m128i)(index), \
- (__v2di)(__m128i)(v1), (int)(scale))
-
-#define _mm256_i64scatter_pd(addr, index, v1, scale) \
- __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)-1, \
- (__v4di)(__m256i)(index), \
- (__v4df)(__m256d)(v1), (int)(scale))
-
-#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) \
- __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)(mask), \
- (__v4di)(__m256i)(index), \
- (__v4df)(__m256d)(v1), (int)(scale))
-
-#define _mm256_i64scatter_epi64(addr, index, v1, scale) \
- __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)-1, \
- (__v4di)(__m256i)(index), \
- (__v4di)(__m256i)(v1), (int)(scale))
-
-#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) \
- __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)(mask), \
- (__v4di)(__m256i)(index), \
- (__v4di)(__m256i)(v1), (int)(scale))
-
-#define _mm_i64scatter_ps(addr, index, v1, scale) \
- __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)-1, \
- (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
- (int)(scale))
-
-#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) \
- __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)(mask), \
- (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \
- (int)(scale))
-
-#define _mm_i64scatter_epi32(addr, index, v1, scale) \
- __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)-1, \
- (__v2di)(__m128i)(index), \
- (__v4si)(__m128i)(v1), (int)(scale))
-
-#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
- __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)(mask), \
- (__v2di)(__m128i)(index), \
- (__v4si)(__m128i)(v1), (int)(scale))
-
-#define _mm256_i64scatter_ps(addr, index, v1, scale) \
- __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)-1, \
- (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
- (int)(scale))
-
-#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) \
- __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)(mask), \
- (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \
- (int)(scale))
-
-#define _mm256_i64scatter_epi32(addr, index, v1, scale) \
- __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)-1, \
- (__v4di)(__m256i)(index), \
- (__v4si)(__m128i)(v1), (int)(scale))
-
-#define _mm256_mask_i64scatter_epi32(addr, mask, index, v1, scale) \
- __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)(mask), \
- (__v4di)(__m256i)(index), \
- (__v4si)(__m128i)(v1), (int)(scale))
-
-#define _mm_i32scatter_pd(addr, index, v1, scale) \
- __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)-1, \
- (__v4si)(__m128i)(index), \
- (__v2df)(__m128d)(v1), (int)(scale))
-
-#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) \
- __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)(mask), \
- (__v4si)(__m128i)(index), \
- (__v2df)(__m128d)(v1), (int)(scale))
-
-#define _mm_i32scatter_epi64(addr, index, v1, scale) \
- __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)-1, \
- (__v4si)(__m128i)(index), \
- (__v2di)(__m128i)(v1), (int)(scale))
-
-#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
- __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)(mask), \
- (__v4si)(__m128i)(index), \
- (__v2di)(__m128i)(v1), (int)(scale))
-
-#define _mm256_i32scatter_pd(addr, index, v1, scale) \
- __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)-1, \
- (__v4si)(__m128i)(index), \
- (__v4df)(__m256d)(v1), (int)(scale))
-
-#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) \
- __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)(mask), \
- (__v4si)(__m128i)(index), \
- (__v4df)(__m256d)(v1), (int)(scale))
-
-#define _mm256_i32scatter_epi64(addr, index, v1, scale) \
- __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)-1, \
- (__v4si)(__m128i)(index), \
- (__v4di)(__m256i)(v1), (int)(scale))
-
-#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) \
- __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)(mask), \
- (__v4si)(__m128i)(index), \
- (__v4di)(__m256i)(v1), (int)(scale))
-
-#define _mm_i32scatter_ps(addr, index, v1, scale) \
- __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)-1, \
- (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
- (int)(scale))
-
-#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) \
- __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)(mask), \
- (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \
- (int)(scale))
-
-#define _mm_i32scatter_epi32(addr, index, v1, scale) \
- __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)-1, \
- (__v4si)(__m128i)(index), \
- (__v4si)(__m128i)(v1), (int)(scale))
-
-#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
- __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)(mask), \
- (__v4si)(__m128i)(index), \
- (__v4si)(__m128i)(v1), (int)(scale))
-
-#define _mm256_i32scatter_ps(addr, index, v1, scale) \
- __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)-1, \
- (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
- (int)(scale))
-
-#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) \
- __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)(mask), \
- (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \
- (int)(scale))
-
-#define _mm256_i32scatter_epi32(addr, index, v1, scale) \
- __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)-1, \
- (__v8si)(__m256i)(index), \
- (__v8si)(__m256i)(v1), (int)(scale))
-
-#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) \
- __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)(mask), \
- (__v8si)(__m256i)(index), \
- (__v8si)(__m256i)(v1), (int)(scale))
-
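-/* Editorial illustration -- not part of the upstream header. The scatter
- * macros above store each mask-selected lane of v1 to
- * (char *)addr + index[i] * scale; lanes whose mask bit is clear are left
- * untouched in memory. A minimal, hypothetical sketch (helper name invented),
- * assuming AVX-512F/VL is enabled at build time: */
-#if 0
-#include <immintrin.h>
-
-/* Store lanes 0 and 2 of vals to base[idx[0]] and base[idx[2]] (mask 0x5). */
-static inline void scatter_two_ints(int *base, __m128i idx, __m128i vals)
-{
-  _mm_mask_i32scatter_epi32(base, (__mmask8)0x5, idx, vals, /*scale*/ 4);
-}
-#endif
-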
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_sqrt_pd(__A),
- (__v2df)__W);
- }
-
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_sqrt_pd(__A),
- (__v2df)_mm_setzero_pd());
- }
-
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
- _mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_sqrt_pd(__A),
- (__v4df)__W);
- }
-
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
- _mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_sqrt_pd(__A),
- (__v4df)_mm256_setzero_pd());
- }
-
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
- _mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_sqrt_ps(__A),
- (__v4sf)__W);
- }
-
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
- _mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_sqrt_ps(__A),
- (__v4sf)_mm_setzero_ps());
- }
-
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
- _mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_sqrt_ps(__A),
- (__v8sf)__W);
- }
-
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
- _mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_sqrt_ps(__A),
- (__v8sf)_mm256_setzero_ps());
- }
-
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_sub_pd(__A, __B),
- (__v2df)__W);
- }
-
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_sub_pd(__A, __B),
- (__v2df)_mm_setzero_pd());
- }
-
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
- _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_sub_pd(__A, __B),
- (__v4df)__W);
- }
-
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
- _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_sub_pd(__A, __B),
- (__v4df)_mm256_setzero_pd());
- }
-
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
- _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_sub_ps(__A, __B),
- (__v4sf)__W);
- }
-
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
- _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_sub_ps(__A, __B),
- (__v4sf)_mm_setzero_ps());
- }
-
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
- _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_sub_ps(__A, __B),
- (__v8sf)__W);
- }
-
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
- _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_sub_ps(__A, __B),
- (__v8sf)_mm256_setzero_ps());
- }
-
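-/* Editorial illustration -- not part of the upstream header. The masked
- * (_mm_mask_...) and zero-masked (_mm_maskz_...) forms above all follow the
- * same pattern: compute the full-width result, then blend it per lane with a
- * select builtin, falling back to __W or to zero where the mask bit is clear.
- * A minimal, hypothetical sketch, assuming AVX-512F/VL is enabled: */
-#if 0
-#include <immintrin.h>
-
-/* Subtract only in lane 0; lane 1 keeps the value from w. */
-static inline __m128d sub_lane0_only(__m128d w, __m128d a, __m128d b)
-{
-  return _mm_mask_sub_pd(w, (__mmask8)0x1, a, b);
-}
-#endif
-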
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) {
- return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I,
- (__v4si)__B);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I,
- __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_permutex2var_epi32(__A, __I, __B),
- (__v4si)__A);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U,
- __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_permutex2var_epi32(__A, __I, __B),
- (__v4si)__I);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I,
- __m128i __B) {
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_permutex2var_epi32(__A, __I, __B),
- (__v4si)_mm_setzero_si128());
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) {
- return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I,
- (__v8si) __B);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I,
- __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_permutex2var_epi32(__A, __I, __B),
- (__v8si)__A);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U,
- __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_permutex2var_epi32(__A, __I, __B),
- (__v8si)__I);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I,
- __m256i __B) {
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_permutex2var_epi32(__A, __I, __B),
- (__v8si)_mm256_setzero_si256());
- }
-
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) {
- return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I,
- (__v2df)__B);
- }
-
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128(__U,
- (__v2df)_mm_permutex2var_pd(__A, __I, __B),
- (__v2df)__A);
- }
-
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128(__U,
- (__v2df)_mm_permutex2var_pd(__A, __I, __B),
- (__v2df)(__m128d)__I);
- }
-
- static __inline__ __m128d __DEFAULT_FN_ATTRS128
- _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) {
- return (__m128d)__builtin_ia32_selectpd_128(__U,
- (__v2df)_mm_permutex2var_pd(__A, __I, __B),
- (__v2df)_mm_setzero_pd());
- }
-
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
- _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) {
- return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I,
- (__v4df)__B);
- }
-
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
- _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I,
- __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256(__U,
- (__v4df)_mm256_permutex2var_pd(__A, __I, __B),
- (__v4df)__A);
- }
-
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
- _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, __mmask8 __U,
- __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256(__U,
- (__v4df)_mm256_permutex2var_pd(__A, __I, __B),
- (__v4df)(__m256d)__I);
- }
-
- static __inline__ __m256d __DEFAULT_FN_ATTRS256
- _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I,
- __m256d __B) {
- return (__m256d)__builtin_ia32_selectpd_256(__U,
- (__v4df)_mm256_permutex2var_pd(__A, __I, __B),
- (__v4df)_mm256_setzero_pd());
- }
-
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
- _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) {
- return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I,
- (__v4sf)__B);
- }
-
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
- _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128(__U,
- (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
- (__v4sf)__A);
- }
-
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
- _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128(__U,
- (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
- (__v4sf)(__m128)__I);
- }
-
- static __inline__ __m128 __DEFAULT_FN_ATTRS128
- _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) {
- return (__m128)__builtin_ia32_selectps_128(__U,
- (__v4sf)_mm_permutex2var_ps(__A, __I, __B),
- (__v4sf)_mm_setzero_ps());
- }
-
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
- _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) {
- return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I,
- (__v8sf) __B);
- }
-
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
- _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256(__U,
- (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
- (__v8sf)__A);
- }
-
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
- _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U,
- __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256(__U,
- (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
- (__v8sf)(__m256)__I);
- }
-
- static __inline__ __m256 __DEFAULT_FN_ATTRS256
- _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I,
- __m256 __B) {
- return (__m256)__builtin_ia32_selectps_256(__U,
- (__v8sf)_mm256_permutex2var_ps(__A, __I, __B),
- (__v8sf)_mm256_setzero_ps());
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) {
- return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I,
- (__v2di)__B);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I,
- __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128(__U,
- (__v2di)_mm_permutex2var_epi64(__A, __I, __B),
- (__v2di)__A);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U,
- __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128(__U,
- (__v2di)_mm_permutex2var_epi64(__A, __I, __B),
- (__v2di)__I);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I,
- __m128i __B) {
- return (__m128i)__builtin_ia32_selectq_128(__U,
- (__v2di)_mm_permutex2var_epi64(__A, __I, __B),
- (__v2di)_mm_setzero_si128());
- }
-
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) {
- return (__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I,
- (__v4di) __B);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I,
- __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256(__U,
- (__v4di)_mm256_permutex2var_epi64(__A, __I, __B),
- (__v4di)__A);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U,
- __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256(__U,
- (__v4di)_mm256_permutex2var_epi64(__A, __I, __B),
- (__v4di)__I);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I,
- __m256i __B) {
- return (__m256i)__builtin_ia32_selectq_256(__U,
- (__v4di)_mm256_permutex2var_epi64(__A, __I, __B),
- (__v4di)_mm256_setzero_si256());
- }
-
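-/* Editorial illustration -- not part of the upstream header. permutex2var
- * picks each destination lane from the concatenation of the two sources: the
- * low index bits select the element and the next bit selects __A (0) or
- * __B (1). A minimal, hypothetical sketch, assuming AVX-512F/VL is enabled: */
-#if 0
-#include <immintrin.h>
-
-/* Interleave the two low 32-bit lanes of a and b: {a0, b0, a1, b1}. */
-static inline __m128i interleave_lo32(__m128i a, __m128i b)
-{
-  const __m128i idx = _mm_setr_epi32(0, 4, 1, 5); /* 0..3 -> a, 4..7 -> b */
-  return _mm_permutex2var_epi32(a, idx, b);
-}
-#endif
-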
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_cvtepi8_epi32(__A),
- (__v4si)__W);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_cvtepi8_epi32(__A),
- (__v4si)_mm_setzero_si128());
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_cvtepi8_epi32(__A),
- (__v8si)__W);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_cvtepi8_epi32(__A),
- (__v8si)_mm256_setzero_si256());
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_cvtepi8_epi64(__A),
- (__v2di)__W);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_cvtepi8_epi64(__A),
- (__v2di)_mm_setzero_si128());
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_cvtepi8_epi64(__A),
- (__v4di)__W);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_cvtepi8_epi64(__A),
- (__v4di)_mm256_setzero_si256());
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
- {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_cvtepi32_epi64(__X),
- (__v2di)__W);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
- {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_cvtepi32_epi64(__X),
- (__v2di)_mm_setzero_si128());
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
- {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_cvtepi32_epi64(__X),
- (__v4di)__W);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X)
- {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_cvtepi32_epi64(__X),
- (__v4di)_mm256_setzero_si256());
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_cvtepi16_epi32(__A),
- (__v4si)__W);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_cvtepi16_epi32(__A),
- (__v4si)_mm_setzero_si128());
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_cvtepi16_epi32(__A),
- (__v8si)__W);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_cvtepi16_epi32(__A),
- (__v8si)_mm256_setzero_si256());
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_cvtepi16_epi64(__A),
- (__v2di)__W);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_cvtepi16_epi64(__A),
- (__v2di)_mm_setzero_si128());
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_cvtepi16_epi64(__A),
- (__v4di)__W);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_cvtepi16_epi64(__A),
- (__v4di)_mm256_setzero_si256());
- }
-
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_cvtepu8_epi32(__A),
- (__v4si)__W);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_cvtepu8_epi32(__A),
- (__v4si)_mm_setzero_si128());
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_cvtepu8_epi32(__A),
- (__v8si)__W);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_cvtepu8_epi32(__A),
- (__v8si)_mm256_setzero_si256());
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_cvtepu8_epi64(__A),
- (__v2di)__W);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_cvtepu8_epi64(__A),
- (__v2di)_mm_setzero_si128());
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_cvtepu8_epi64(__A),
- (__v4di)__W);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_cvtepu8_epi64(__A),
- (__v4di)_mm256_setzero_si256());
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X)
- {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_cvtepu32_epi64(__X),
- (__v2di)__W);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
- {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_cvtepu32_epi64(__X),
- (__v2di)_mm_setzero_si128());
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X)
- {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_cvtepu32_epi64(__X),
- (__v4di)__W);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X)
- {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_cvtepu32_epi64(__X),
- (__v4di)_mm256_setzero_si256());
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_cvtepu16_epi32(__A),
- (__v4si)__W);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_cvtepu16_epi32(__A),
- (__v4si)_mm_setzero_si128());
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_cvtepu16_epi32(__A),
- (__v8si)__W);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_cvtepu16_epi32(__A),
- (__v8si)_mm256_setzero_si256());
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_cvtepu16_epi64(__A),
- (__v2di)__W);
- }
-
- static __inline__ __m128i __DEFAULT_FN_ATTRS128
- _mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
- {
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_cvtepu16_epi64(__A),
- (__v2di)_mm_setzero_si128());
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_cvtepu16_epi64(__A),
- (__v4di)__W);
- }
-
- static __inline__ __m256i __DEFAULT_FN_ATTRS256
- _mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
- {
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_cvtepu16_epi64(__A),
- (__v4di)_mm256_setzero_si256());
- }
-
-
-#define _mm_rol_epi32(a, b) \
- ((__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(b)))
-
-#define _mm_mask_rol_epi32(w, u, a, b) \
- ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
- (__v4si)_mm_rol_epi32((a), (b)), \
- (__v4si)(__m128i)(w)))
-
-#define _mm_maskz_rol_epi32(u, a, b) \
- ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
- (__v4si)_mm_rol_epi32((a), (b)), \
- (__v4si)_mm_setzero_si128()))
-
-#define _mm256_rol_epi32(a, b) \
- ((__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b)))
-
-#define _mm256_mask_rol_epi32(w, u, a, b) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
- (__v8si)_mm256_rol_epi32((a), (b)), \
- (__v8si)(__m256i)(w)))
-
-#define _mm256_maskz_rol_epi32(u, a, b) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
- (__v8si)_mm256_rol_epi32((a), (b)), \
- (__v8si)_mm256_setzero_si256()))
-
-#define _mm_rol_epi64(a, b) \
- ((__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b)))
-
-#define _mm_mask_rol_epi64(w, u, a, b) \
- ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
- (__v2di)_mm_rol_epi64((a), (b)), \
- (__v2di)(__m128i)(w)))
-
-#define _mm_maskz_rol_epi64(u, a, b) \
- ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
- (__v2di)_mm_rol_epi64((a), (b)), \
- (__v2di)_mm_setzero_si128()))
-
-#define _mm256_rol_epi64(a, b) \
- ((__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b)))
-
-#define _mm256_mask_rol_epi64(w, u, a, b) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
- (__v4di)_mm256_rol_epi64((a), (b)), \
- (__v4di)(__m256i)(w)))
-
-#define _mm256_maskz_rol_epi64(u, a, b) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
- (__v4di)_mm256_rol_epi64((a), (b)), \
- (__v4di)_mm256_setzero_si256()))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_rolv_epi32 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_prolvd128((__v4si)__A, (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_rolv_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_rolv_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_rolv_epi32 (__m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_prolvd256((__v8si)__A, (__v8si)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_rolv_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_rolv_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_rolv_epi64 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_prolvq128((__v2di)__A, (__v2di)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128(__U,
- (__v2di)_mm_rolv_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128(__U,
- (__v2di)_mm_rolv_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_rolv_epi64 (__m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_prolvq256((__v4di)__A, (__v4di)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256(__U,
- (__v4di)_mm256_rolv_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256(__U,
- (__v4di)_mm256_rolv_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-#define _mm_ror_epi32(a, b) \
- ((__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b)))
-
-#define _mm_mask_ror_epi32(w, u, a, b) \
- ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
- (__v4si)_mm_ror_epi32((a), (b)), \
- (__v4si)(__m128i)(w)))
-
-#define _mm_maskz_ror_epi32(u, a, b) \
- ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \
- (__v4si)_mm_ror_epi32((a), (b)), \
- (__v4si)_mm_setzero_si128()))
-
-#define _mm256_ror_epi32(a, b) \
- ((__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b)))
-
-#define _mm256_mask_ror_epi32(w, u, a, b) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
- (__v8si)_mm256_ror_epi32((a), (b)), \
- (__v8si)(__m256i)(w)))
-
-#define _mm256_maskz_ror_epi32(u, a, b) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \
- (__v8si)_mm256_ror_epi32((a), (b)), \
- (__v8si)_mm256_setzero_si256()))
-
-#define _mm_ror_epi64(a, b) \
- ((__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b)))
-
-#define _mm_mask_ror_epi64(w, u, a, b) \
- ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
- (__v2di)_mm_ror_epi64((a), (b)), \
- (__v2di)(__m128i)(w)))
-
-#define _mm_maskz_ror_epi64(u, a, b) \
- ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \
- (__v2di)_mm_ror_epi64((a), (b)), \
- (__v2di)_mm_setzero_si128()))
-
-#define _mm256_ror_epi64(a, b) \
- ((__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b)))
-
-#define _mm256_mask_ror_epi64(w, u, a, b) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
- (__v4di)_mm256_ror_epi64((a), (b)), \
- (__v4di)(__m256i)(w)))
-
-#define _mm256_maskz_ror_epi64(u, a, b) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \
- (__v4di)_mm256_ror_epi64((a), (b)), \
- (__v4di)_mm256_setzero_si256()))
-
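-/* Editorial illustration -- not part of the upstream header. The rol/ror
- * macros above take an immediate rotate count, while the rolv/rorv functions
- * take a per-lane count vector. A minimal, hypothetical sketch, assuming
- * AVX-512F/VL is enabled: */
-#if 0
-#include <immintrin.h>
-
-/* Rotate each 32-bit lane of v left by 8 bits. */
-static inline __m128i rotl8_epi32(__m128i v)
-{
-  return _mm_rol_epi32(v, 8);
-}
-#endif
-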
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_sll_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_sll_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_sll_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_sll_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_slli_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_slli_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_slli_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_slli_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_sll_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_sll_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_sll_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_sll_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_slli_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_slli_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_slli_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_slli_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_rorv_epi32 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_prorvd128((__v4si)__A, (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_rorv_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_rorv_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_rorv_epi32 (__m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_prorvd256((__v8si)__A, (__v8si)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_rorv_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_rorv_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_rorv_epi64 (__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_prorvq128((__v2di)__A, (__v2di)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128(__U,
- (__v2di)_mm_rorv_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128(__U,
- (__v2di)_mm_rorv_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_rorv_epi64 (__m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_prorvq256((__v4di)__A, (__v4di)__B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256(__U,
- (__v4di)_mm256_rorv_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256(__U,
- (__v4di)_mm256_rorv_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_sllv_epi64(__X, __Y),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_sllv_epi64(__X, __Y),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_sllv_epi64(__X, __Y),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_sllv_epi64(__X, __Y),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_sllv_epi32(__X, __Y),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_sllv_epi32(__X, __Y),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_sllv_epi32(__X, __Y),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_sllv_epi32(__X, __Y),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_srlv_epi64(__X, __Y),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_srlv_epi64(__X, __Y),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_srlv_epi64(__X, __Y),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_srlv_epi64(__X, __Y),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_srlv_epi32(__X, __Y),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_srlv_epi32(__X, __Y),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_srlv_epi32(__X, __Y),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_srlv_epi32(__X, __Y),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_srl_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_srl_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_srl_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_srl_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_srli_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_srli_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_srli_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_srli_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_srl_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_srl_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_srl_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_srl_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_srli_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_srli_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_srli_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_srli_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_srav_epi32(__X, __Y),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_srav_epi32(__X, __Y),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_srav_epi32(__X, __Y),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_srav_epi32(__X, __Y),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srav_epi64(__m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_srav_epi64(__X, __Y),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_srav_epi64(__X, __Y),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srav_epi64(__m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di) __Y);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_srav_epi64(__X, __Y),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_srav_epi64(__X, __Y),
- (__v4di)_mm256_setzero_si256());
-}
-
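-/* Editorial illustration -- not part of the upstream header. The sllv, srlv
- * and srav forms above shift each lane by its own count; _mm_srav_epi64 and
- * _mm256_srav_epi64 fill a gap left by AVX2, which provides no 64-bit
- * arithmetic right shift. A minimal, hypothetical sketch, assuming
- * AVX-512F/VL is enabled: */
-#if 0
-#include <immintrin.h>
-
-/* Arithmetic right shift of each 64-bit lane of v by its own count. */
-static inline __m128i sar64_per_lane(__m128i v, __m128i counts)
-{
-  return _mm_srav_epi64(v, counts);
-}
-#endif
-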
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
-{
- return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
- (__v4si) __A,
- (__v4si) __W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A)
-{
- return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U,
- (__v4si) __A,
- (__v4si) _mm_setzero_si128 ());
-}
-
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
-{
- return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
- (__v8si) __A,
- (__v8si) __W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A)
-{
- return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U,
- (__v8si) __A,
- (__v8si) _mm256_setzero_si256 ());
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS128
-_mm_load_epi32 (void const *__P)
-{
- return *(const __m128i *) __P;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_movdqa32load128_mask ((const __v4si *) __P,
- (__v4si) __W,
- (__mmask8)
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_load_epi32 (__mmask8 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_movdqa32load128_mask ((const __v4si *) __P,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8)
- __U);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS256
-_mm256_load_epi32 (void const *__P)
-{
- return *(const __m256i *) __P;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_movdqa32load256_mask ((const __v8si *) __P,
- (__v8si) __W,
- (__mmask8)
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_movdqa32load256_mask ((const __v8si *) __P,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8)
- __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_store_epi32 (void *__P, __m128i __A)
-{
- *(__m128i *) __P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A)
-{
- __builtin_ia32_movdqa32store128_mask ((__v4si *) __P,
- (__v4si) __A,
- (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS256
-_mm256_store_epi32 (void *__P, __m256i __A)
-{
- *(__m256i *) __P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A)
-{
- __builtin_ia32_movdqa32store256_mask ((__v8si *) __P,
- (__v8si) __A,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
-{
- return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
- (__v2di) __A,
- (__v2di) __W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A)
-{
- return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U,
- (__v2di) __A,
- (__v2di) _mm_setzero_si128 ());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
-{
- return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
- (__v4di) __A,
- (__v4di) __W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A)
-{
- return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U,
- (__v4di) __A,
- (__v4di) _mm256_setzero_si256 ());
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS128
-_mm_load_epi64 (void const *__P)
-{
- return *(const __m128i *) __P;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_movdqa64load128_mask ((const __v2di *) __P,
- (__v2di) __W,
- (__mmask8)
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_load_epi64 (__mmask8 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_movdqa64load128_mask ((const __v2di *) __P,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8)
- __U);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS256
-_mm256_load_epi64 (void const *__P)
-{
- return *(const __m256i *) __P;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_movdqa64load256_mask ((const __v4di *) __P,
- (__v4di) __W,
- (__mmask8)
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_movdqa64load256_mask ((const __v4di *) __P,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8)
- __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_store_epi64 (void *__P, __m128i __A)
-{
- *(__m128i *) __P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A)
-{
- __builtin_ia32_movdqa64store128_mask ((__v2di *) __P,
- (__v2di) __A,
- (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS256
-_mm256_store_epi64 (void *__P, __m256i __A)
-{
- *(__m256i *) __P = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A)
-{
- __builtin_ia32_movdqa64store256_mask ((__v4di *) __P,
- (__v4di) __A,
- (__mmask8) __U);
-}
-
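-/* Editorial illustration -- not part of the upstream header. The load/store
- * intrinsics above map to the aligned vmovdqa32/vmovdqa64 forms, so the
- * pointer must be 16-byte (128-bit) or 32-byte (256-bit) aligned; with the
- * masked variants, masked-off lanes are not stored, and masked-off elements
- * of a load do not fault. A minimal, hypothetical sketch, assuming
- * AVX-512F/VL is enabled: */
-#if 0
-#include <immintrin.h>
-
-/* Load the first three ints from a 16-byte-aligned buffer, zeroing lane 3. */
-static inline __m128i load3_zero_top(const void *aligned_p)
-{
-  return _mm_maskz_load_epi32((__mmask8)0x7, aligned_p);
-}
-#endif
-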
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A)
-{
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_movedup_pd(__A),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A)
-{
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_movedup_pd(__A),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A)
-{
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_movedup_pd(__A),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
-{
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_movedup_pd(__A),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A)
-{
- return (__m128i)__builtin_ia32_selectd_128(__M,
- (__v4si) _mm_set1_epi32(__A),
- (__v4si)__O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_set1_epi32(__mmask8 __M, int __A)
-{
- return (__m128i)__builtin_ia32_selectd_128(__M,
- (__v4si) _mm_set1_epi32(__A),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A)
-{
- return (__m256i)__builtin_ia32_selectd_256(__M,
- (__v8si) _mm256_set1_epi32(__A),
- (__v8si)__O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_set1_epi32(__mmask8 __M, int __A)
-{
- return (__m256i)__builtin_ia32_selectd_256(__M,
- (__v8si) _mm256_set1_epi32(__A),
- (__v8si)_mm256_setzero_si256());
-}
-
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
-{
- return (__m128i) __builtin_ia32_selectq_128(__M,
- (__v2di) _mm_set1_epi64x(__A),
- (__v2di) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
-{
- return (__m128i) __builtin_ia32_selectq_128(__M,
- (__v2di) _mm_set1_epi64x(__A),
- (__v2di) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
-{
- return (__m256i) __builtin_ia32_selectq_256(__M,
- (__v4di) _mm256_set1_epi64x(__A),
- (__v4di) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
-{
- return (__m256i) __builtin_ia32_selectq_256(__M,
- (__v4di) _mm256_set1_epi64x(__A),
- (__v4di) _mm256_setzero_si256());
-}
-
-#define _mm_fixupimm_pd(A, B, C, imm) \
- ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2di)(__m128i)(C), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm_mask_fixupimm_pd(A, U, B, C, imm) \
- ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2di)(__m128i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm_maskz_fixupimm_pd(U, A, B, C, imm) \
- ((__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \
- (__v2df)(__m128d)(B), \
- (__v2di)(__m128i)(C), \
- (int)(imm), (__mmask8)(U)))
-
-#define _mm256_fixupimm_pd(A, B, C, imm) \
- ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
- (__v4df)(__m256d)(B), \
- (__v4di)(__m256i)(C), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm256_mask_fixupimm_pd(A, U, B, C, imm) \
- ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \
- (__v4df)(__m256d)(B), \
- (__v4di)(__m256i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) \
- ((__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \
- (__v4df)(__m256d)(B), \
- (__v4di)(__m256i)(C), \
- (int)(imm), (__mmask8)(U)))
-
-#define _mm_fixupimm_ps(A, B, C, imm) \
- ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4si)(__m128i)(C), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm_mask_fixupimm_ps(A, U, B, C, imm) \
- ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4si)(__m128i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) \
- ((__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \
- (__v4sf)(__m128)(B), \
- (__v4si)(__m128i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm256_fixupimm_ps(A, B, C, imm) \
- ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
- (__v8sf)(__m256)(B), \
- (__v8si)(__m256i)(C), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm256_mask_fixupimm_ps(A, U, B, C, imm) \
- ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \
- (__v8sf)(__m256)(B), \
- (__v8si)(__m256i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) \
- ((__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \
- (__v8sf)(__m256)(B), \
- (__v8si)(__m256i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P)
-{
- return (__m128d) __builtin_ia32_loadapd128_mask ((const __v2df *) __P,
- (__v2df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_load_pd (__mmask8 __U, void const *__P)
-{
- return (__m128d) __builtin_ia32_loadapd128_mask ((const __v2df *) __P,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P)
-{
- return (__m256d) __builtin_ia32_loadapd256_mask ((const __v4df *) __P,
- (__v4df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_load_pd (__mmask8 __U, void const *__P)
-{
- return (__m256d) __builtin_ia32_loadapd256_mask ((const __v4df *) __P,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P)
-{
- return (__m128) __builtin_ia32_loadaps128_mask ((const __v4sf *) __P,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_load_ps (__mmask8 __U, void const *__P)
-{
- return (__m128) __builtin_ia32_loadaps128_mask ((const __v4sf *) __P,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P)
-{
- return (__m256) __builtin_ia32_loadaps256_mask ((const __v8sf *) __P,
- (__v8sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_load_ps (__mmask8 __U, void const *__P)
-{
- return (__m256) __builtin_ia32_loadaps256_mask ((const __v8sf *) __P,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS128
-_mm_loadu_epi64 (void const *__P)
-{
- struct __loadu_epi64 {
- __m128i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_epi64*)__P)->__v;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_loaddqudi128_mask ((const __v2di *) __P,
- (__v2di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_loaddqudi128_mask ((const __v2di *) __P,
- (__v2di)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS256
-_mm256_loadu_epi64 (void const *__P)
-{
- struct __loadu_epi64 {
- __m256i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_epi64*)__P)->__v;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_loaddqudi256_mask ((const __v4di *) __P,
- (__v4di) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_loaddqudi256_mask ((const __v4di *) __P,
- (__v4di)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
-}
-
-static __inline __m128i __DEFAULT_FN_ATTRS128
-_mm_loadu_epi32 (void const *__P)
-{
- struct __loadu_epi32 {
- __m128i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_epi32*)__P)->__v;
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_loaddqusi128_mask ((const __v4si *) __P,
- (__v4si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_loaddqusi128_mask ((const __v4si *) __P,
- (__v4si)
- _mm_setzero_si128 (),
- (__mmask8) __U);
-}
-
-static __inline __m256i __DEFAULT_FN_ATTRS256
-_mm256_loadu_epi32 (void const *__P)
-{
- struct __loadu_epi32 {
- __m256i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_epi32*)__P)->__v;
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_loaddqusi256_mask ((const __v8si *) __P,
- (__v8si) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_loaddqusi256_mask ((const __v8si *) __P,
- (__v8si)
- _mm256_setzero_si256 (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P)
-{
- return (__m128d) __builtin_ia32_loadupd128_mask ((const __v2df *) __P,
- (__v2df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_loadu_pd (__mmask8 __U, void const *__P)
-{
- return (__m128d) __builtin_ia32_loadupd128_mask ((const __v2df *) __P,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P)
-{
- return (__m256d) __builtin_ia32_loadupd256_mask ((const __v4df *) __P,
- (__v4df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P)
-{
- return (__m256d) __builtin_ia32_loadupd256_mask ((const __v4df *) __P,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P)
-{
- return (__m128) __builtin_ia32_loadups128_mask ((const __v4sf *) __P,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_loadu_ps (__mmask8 __U, void const *__P)
-{
- return (__m128) __builtin_ia32_loadups128_mask ((const __v4sf *) __P,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P)
-{
- return (__m256) __builtin_ia32_loadups256_mask ((const __v8sf *) __P,
- (__v8sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P)
-{
- return (__m256) __builtin_ia32_loadups256_mask ((const __v8sf *) __P,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A)
-{
- __builtin_ia32_storeapd128_mask ((__v2df *) __P,
- (__v2df) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A)
-{
- __builtin_ia32_storeapd256_mask ((__v4df *) __P,
- (__v4df) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A)
-{
- __builtin_ia32_storeaps128_mask ((__v4sf *) __P,
- (__v4sf) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A)
-{
- __builtin_ia32_storeaps256_mask ((__v8sf *) __P,
- (__v8sf) __A,
- (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_storeu_epi64 (void *__P, __m128i __A)
-{
- struct __storeu_epi64 {
- __m128i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_epi64*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A)
-{
- __builtin_ia32_storedqudi128_mask ((__v2di *) __P,
- (__v2di) __A,
- (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS256
-_mm256_storeu_epi64 (void *__P, __m256i __A)
-{
- struct __storeu_epi64 {
- __m256i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_epi64*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A)
-{
- __builtin_ia32_storedqudi256_mask ((__v4di *) __P,
- (__v4di) __A,
- (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_storeu_epi32 (void *__P, __m128i __A)
-{
- struct __storeu_epi32 {
- __m128i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_epi32*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A)
-{
- __builtin_ia32_storedqusi128_mask ((__v4si *) __P,
- (__v4si) __A,
- (__mmask8) __U);
-}
-
-static __inline void __DEFAULT_FN_ATTRS256
-_mm256_storeu_epi32 (void *__P, __m256i __A)
-{
- struct __storeu_epi32 {
- __m256i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_epi32*)__P)->__v = __A;
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A)
-{
- __builtin_ia32_storedqusi256_mask ((__v8si *) __P,
- (__v8si) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A)
-{
- __builtin_ia32_storeupd128_mask ((__v2df *) __P,
- (__v2df) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A)
-{
- __builtin_ia32_storeupd256_mask ((__v4df *) __P,
- (__v4df) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A)
-{
- __builtin_ia32_storeups128_mask ((__v4sf *) __P,
- (__v4sf) __A,
- (__mmask8) __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A)
-{
- __builtin_ia32_storeups256_mask ((__v8sf *) __P,
- (__v8sf) __A,
- (__mmask8) __U);
-}
-
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_unpackhi_pd(__A, __B),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_unpackhi_pd(__A, __B),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
-{
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_unpackhi_pd(__A, __B),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B)
-{
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_unpackhi_pd(__A, __B),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_unpackhi_ps(__A, __B),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_unpackhi_ps(__A, __B),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_unpackhi_ps(__A, __B),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_unpackhi_ps(__A, __B),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_unpacklo_pd(__A, __B),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B)
-{
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_unpacklo_pd(__A, __B),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
-{
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_unpacklo_pd(__A, __B),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B)
-{
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_unpacklo_pd(__A, __B),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_unpacklo_ps(__A, __B),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B)
-{
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_unpacklo_ps(__A, __B),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_unpacklo_ps(__A, __B),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_unpacklo_ps(__A, __B),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_rcp14_pd (__m128d __A)
-{
- return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A)
-{
- return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
- (__v2df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A)
-{
- return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_rcp14_pd (__m256d __A)
-{
- return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A)
-{
- return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
- (__v4df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A)
-{
- return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_rcp14_ps (__m128 __A)
-{
- return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A)
-{
- return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A)
-{
- return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_rcp14_ps (__m256 __A)
-{
- return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A)
-{
- return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
- (__v8sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
-{
- return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
-}
-
-#define _mm_mask_permute_pd(W, U, X, C) \
- ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
- (__v2df)_mm_permute_pd((X), (C)), \
- (__v2df)(__m128d)(W)))
-
-#define _mm_maskz_permute_pd(U, X, C) \
- ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
- (__v2df)_mm_permute_pd((X), (C)), \
- (__v2df)_mm_setzero_pd()))
-
-#define _mm256_mask_permute_pd(W, U, X, C) \
- ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
- (__v4df)_mm256_permute_pd((X), (C)), \
- (__v4df)(__m256d)(W)))
-
-#define _mm256_maskz_permute_pd(U, X, C) \
- ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
- (__v4df)_mm256_permute_pd((X), (C)), \
- (__v4df)_mm256_setzero_pd()))
-
-#define _mm_mask_permute_ps(W, U, X, C) \
- ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
- (__v4sf)_mm_permute_ps((X), (C)), \
- (__v4sf)(__m128)(W)))
-
-#define _mm_maskz_permute_ps(U, X, C) \
- ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
- (__v4sf)_mm_permute_ps((X), (C)), \
- (__v4sf)_mm_setzero_ps()))
-
-#define _mm256_mask_permute_ps(W, U, X, C) \
- ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
- (__v8sf)_mm256_permute_ps((X), (C)), \
- (__v8sf)(__m256)(W)))
-
-#define _mm256_maskz_permute_ps(U, X, C) \
- ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
- (__v8sf)_mm256_permute_ps((X), (C)), \
- (__v8sf)_mm256_setzero_ps()))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C)
-{
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_permutevar_pd(__A, __C),
- (__v2df)__W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C)
-{
- return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U,
- (__v2df)_mm_permutevar_pd(__A, __C),
- (__v2df)_mm_setzero_pd());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C)
-{
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_permutevar_pd(__A, __C),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C)
-{
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_permutevar_pd(__A, __C),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C)
-{
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_permutevar_ps(__A, __C),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C)
-{
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_permutevar_ps(__A, __C),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_permutevar_ps(__A, __C),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_permutevar_ps(__A, __C),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_test_epi32_mask (__m128i __A, __m128i __B)
-{
- return _mm_cmpneq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
-{
- return _mm_mask_cmpneq_epi32_mask (__U, _mm_and_si128 (__A, __B),
- _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_test_epi32_mask (__m256i __A, __m256i __B)
-{
- return _mm256_cmpneq_epi32_mask (_mm256_and_si256 (__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
-{
- return _mm256_mask_cmpneq_epi32_mask (__U, _mm256_and_si256 (__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_test_epi64_mask (__m128i __A, __m128i __B)
-{
- return _mm_cmpneq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
-{
- return _mm_mask_cmpneq_epi64_mask (__U, _mm_and_si128 (__A, __B),
- _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_test_epi64_mask (__m256i __A, __m256i __B)
-{
- return _mm256_cmpneq_epi64_mask (_mm256_and_si256 (__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
-{
- return _mm256_mask_cmpneq_epi64_mask (__U, _mm256_and_si256 (__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_testn_epi32_mask (__m128i __A, __m128i __B)
-{
- return _mm_cmpeq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
-{
- return _mm_mask_cmpeq_epi32_mask (__U, _mm_and_si128 (__A, __B),
- _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_testn_epi32_mask (__m256i __A, __m256i __B)
-{
- return _mm256_cmpeq_epi32_mask (_mm256_and_si256 (__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
-{
- return _mm256_mask_cmpeq_epi32_mask (__U, _mm256_and_si256 (__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_testn_epi64_mask (__m128i __A, __m128i __B)
-{
- return _mm_cmpeq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS128
-_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
-{
- return _mm_mask_cmpeq_epi64_mask (__U, _mm_and_si128 (__A, __B),
- _mm_setzero_si128());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_testn_epi64_mask (__m256i __A, __m256i __B)
-{
- return _mm256_cmpeq_epi64_mask (_mm256_and_si256 (__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __mmask8 __DEFAULT_FN_ATTRS256
-_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
-{
- return _mm256_mask_cmpeq_epi64_mask (__U, _mm256_and_si256 (__A, __B),
- _mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_unpackhi_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_unpackhi_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_unpackhi_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_unpackhi_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_unpackhi_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_unpackhi_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_unpackhi_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_unpackhi_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_unpacklo_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_unpacklo_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_unpacklo_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_unpacklo_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_unpacklo_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_unpacklo_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_unpacklo_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_unpacklo_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_sra_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_sra_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_sra_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_sra_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_srai_epi32(__A, __B),
- (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U,
- (__v4si)_mm_srai_epi32(__A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_srai_epi32(__A, __B),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U,
- (__v8si)_mm256_srai_epi32(__A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_sra_epi64(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_sra_epi64(__A, __B),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_sra_epi64(__A, __B),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sra_epi64(__m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_psraq256((__v4di) __A, (__v2di) __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_sra_epi64(__A, __B),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_sra_epi64(__A, __B),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_srai_epi64(__m128i __A, unsigned int __imm)
-{
- return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __imm)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_srai_epi64(__A, __imm),
- (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm)
-{
- return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U,
- (__v2di)_mm_srai_epi64(__A, __imm),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srai_epi64(__m256i __A, unsigned int __imm)
-{
- return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A,
- unsigned int __imm)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_srai_epi64(__A, __imm),
- (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U,
- (__v4di)_mm256_srai_epi64(__A, __imm),
- (__v4di)_mm256_setzero_si256());
-}
-
-#define _mm_ternarylogic_epi32(A, B, C, imm) \
- ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), \
- (__v4si)(__m128i)(C), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) \
- ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), \
- (__v4si)(__m128i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \
- ((__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), \
- (__v4si)(__m128i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm256_ternarylogic_epi32(A, B, C, imm) \
- ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), \
- (__v8si)(__m256i)(C), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) \
- ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), \
- (__v8si)(__m256i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \
- ((__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), \
- (__v8si)(__m256i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm_ternarylogic_epi64(A, B, C, imm) \
- ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), \
- (__v2di)(__m128i)(C), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) \
- ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), \
- (__v2di)(__m128i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm_maskz_ternarylogic_epi64(U, A, B, C, imm) \
- ((__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), \
- (__v2di)(__m128i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm256_ternarylogic_epi64(A, B, C, imm) \
- ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), \
- (__v4di)(__m256i)(C), (int)(imm), \
- (__mmask8)-1))
-
-#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) \
- ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), \
- (__v4di)(__m256i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \
- ((__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), \
- (__v4di)(__m256i)(C), (int)(imm), \
- (__mmask8)(U)))
-
-
-
-#define _mm256_shuffle_f32x4(A, B, imm) \
- ((__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \
- (__v8sf)(__m256)(B), (int)(imm)))
-
-#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) \
- ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
- (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
- (__v8sf)(__m256)(W)))
-
-#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) \
- ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
- (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \
- (__v8sf)_mm256_setzero_ps()))
-
-#define _mm256_shuffle_f64x2(A, B, imm) \
- ((__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \
- (__v4df)(__m256d)(B), (int)(imm)))
-
-#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) \
- ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
- (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \
- (__v4df)(__m256d)(W)))
-
-#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) \
- ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
- (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \
- (__v4df)_mm256_setzero_pd()))
-
-#define _mm256_shuffle_i32x4(A, B, imm) \
- ((__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), (int)(imm)))
-
-#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \
- (__v8si)(__m256i)(W)))
-
-#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \
- (__v8si)_mm256_setzero_si256()))
-
-#define _mm256_shuffle_i64x2(A, B, imm) \
- ((__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), (int)(imm)))
-
-#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \
- (__v4di)(__m256i)(W)))
-
-
-#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \
- (__v4di)_mm256_setzero_si256()))
-
-#define _mm_mask_shuffle_pd(W, U, A, B, M) \
- ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
- (__v2df)_mm_shuffle_pd((A), (B), (M)), \
- (__v2df)(__m128d)(W)))
-
-#define _mm_maskz_shuffle_pd(U, A, B, M) \
- ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \
- (__v2df)_mm_shuffle_pd((A), (B), (M)), \
- (__v2df)_mm_setzero_pd()))
-
-#define _mm256_mask_shuffle_pd(W, U, A, B, M) \
- ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
- (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
- (__v4df)(__m256d)(W)))
-
-#define _mm256_maskz_shuffle_pd(U, A, B, M) \
- ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
- (__v4df)_mm256_shuffle_pd((A), (B), (M)), \
- (__v4df)_mm256_setzero_pd()))
-
-#define _mm_mask_shuffle_ps(W, U, A, B, M) \
- ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
- (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
- (__v4sf)(__m128)(W)))
-
-#define _mm_maskz_shuffle_ps(U, A, B, M) \
- ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \
- (__v4sf)_mm_shuffle_ps((A), (B), (M)), \
- (__v4sf)_mm_setzero_ps()))
-
-#define _mm256_mask_shuffle_ps(W, U, A, B, M) \
- ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
- (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
- (__v8sf)(__m256)(W)))
-
-#define _mm256_maskz_shuffle_ps(U, A, B, M) \
- ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
- (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \
- (__v8sf)_mm256_setzero_ps()))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_rsqrt14_pd (__m128d __A)
-{
- return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A)
-{
- return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
- (__v2df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A)
-{
- return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
- (__v2df)
- _mm_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_rsqrt14_pd (__m256d __A)
-{
- return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A)
-{
- return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
- (__v4df) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A)
-{
- return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
- (__v4df)
- _mm256_setzero_pd (),
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_rsqrt14_ps (__m128 __A)
-{
- return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A)
-{
- return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A)
-{
- return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_rsqrt14_ps (__m256 __A)
-{
- return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) -1);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A)
-{
- return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
- (__v8sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
-{
- return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_broadcast_f32x4(__m128 __A)
-{
- return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A,
- 0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
- (__v8sf)_mm256_broadcast_f32x4(__A),
- (__v8sf)__O);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__M,
- (__v8sf)_mm256_broadcast_f32x4(__A),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_broadcast_i32x4(__m128i __A)
-{
- return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A,
- 0, 1, 2, 3, 0, 1, 2, 3);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_broadcast_i32x4(__A),
- (__v8si)__O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_broadcast_i32x4(__A),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A)
-{
- return (__m256d)__builtin_ia32_selectpd_256(__M,
- (__v4df) _mm256_broadcastsd_pd(__A),
- (__v4df) __O);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
-{
- return (__m256d)__builtin_ia32_selectpd_256(__M,
- (__v4df) _mm256_broadcastsd_pd(__A),
- (__v4df) _mm256_setzero_pd());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A)
-{
- return (__m128)__builtin_ia32_selectps_128(__M,
- (__v4sf) _mm_broadcastss_ps(__A),
- (__v4sf) __O);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
-{
- return (__m128)__builtin_ia32_selectps_128(__M,
- (__v4sf) _mm_broadcastss_ps(__A),
- (__v4sf) _mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A)
-{
- return (__m256)__builtin_ia32_selectps_256(__M,
- (__v8sf) _mm256_broadcastss_ps(__A),
- (__v8sf) __O);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
-{
- return (__m256)__builtin_ia32_selectps_256(__M,
- (__v8sf) _mm256_broadcastss_ps(__A),
- (__v8sf) _mm256_setzero_ps());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectd_128(__M,
- (__v4si) _mm_broadcastd_epi32(__A),
- (__v4si) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectd_128(__M,
- (__v4si) _mm_broadcastd_epi32(__A),
- (__v4si) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectd_256(__M,
- (__v8si) _mm256_broadcastd_epi32(__A),
- (__v8si) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectd_256(__M,
- (__v8si) _mm256_broadcastd_epi32(__A),
- (__v8si) _mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectq_128(__M,
- (__v2di) _mm_broadcastq_epi64(__A),
- (__v2di) __O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
-{
- return (__m128i)__builtin_ia32_selectq_128(__M,
- (__v2di) _mm_broadcastq_epi64(__A),
- (__v2di) _mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectq_256(__M,
- (__v4di) _mm256_broadcastq_epi64(__A),
- (__v4di) __O);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
-{
- return (__m256i)__builtin_ia32_selectq_256(__M,
- (__v4di) _mm256_broadcastq_epi64(__A),
- (__v4di) _mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtsepi32_epi8 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
- (__v16qi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
- (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtsepi32_epi8 (__m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
- (__v16qi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
- (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtsepi32_epi16 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
- (__v8hi)_mm_setzero_si128 (),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
- (__v8hi)__O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovsdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtsepi32_epi16 (__m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
- (__v8hi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtsepi64_epi8 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
- (__v16qi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
- (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovsqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtsepi64_epi8 (__m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
- (__v16qi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
- (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovsqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtsepi64_epi32 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
- (__v4si)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
- (__v4si) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
- (__v4si) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovsqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtsepi64_epi32 (__m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
- (__v4si)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
- (__v4si)__O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
- (__v4si) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtsepi64_epi16 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
- (__v8hi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovsqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtsepi64_epi16 (__m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
- (__v8hi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovsqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtusepi32_epi8 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
- (__v16qi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
- (__v16qi) __O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovusdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtusepi32_epi8 (__m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
- (__v16qi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
- (__v16qi) __O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovusdb256mem_mask ((__v16qi*) __P, (__v8si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtusepi32_epi16 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
- (__v8hi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovusdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtusepi32_epi16 (__m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
- (__v8hi) _mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtusepi64_epi8 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
- (__v16qi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
- (__v16qi) __O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovusqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtusepi64_epi8 (__m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
- (__v16qi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
- (__v16qi) __O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovusqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtusepi64_epi32 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
- (__v4si)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
- (__v4si) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
- (__v4si) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovusqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtusepi64_epi32 (__m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
- (__v4si)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
- (__v4si) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
- (__v4si) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtusepi64_epi16 (__m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
- (__v8hi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovusqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtusepi64_epi16 (__m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
- (__v8hi)_mm_undefined_si128(),
- (__mmask8) -1);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi32_epi8 (__m128i __A)
-{
- return (__m128i)__builtin_shufflevector(
- __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
- 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
- (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
- (__v16qi)
- _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi32_epi8 (__m256i __A)
-{
- return (__m128i)__builtin_shufflevector(
- __builtin_convertvector((__v8si)__A, __v8qi),
- (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
- 12, 13, 14, 15);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
- (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi32_epi16 (__m128i __A)
-{
- return (__m128i)__builtin_shufflevector(
- __builtin_convertvector((__v4si)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1,
- 2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi32_epi16 (__m256i __A)
-{
- return (__m128i)__builtin_convertvector((__v8si)__A, __v8hi);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_epi8 (__m128i __A)
-{
- return (__m128i)__builtin_shufflevector(
- __builtin_convertvector((__v2di)__A, __v2qi), (__v2qi){0, 0}, 0, 1, 2, 3,
- 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
- (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_epi8 (__m256i __A)
-{
- return (__m128i)__builtin_shufflevector(
- __builtin_convertvector((__v4di)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1,
- 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
- (__v16qi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
- (__v16qi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_epi32 (__m128i __A)
-{
- return (__m128i)__builtin_shufflevector(
- __builtin_convertvector((__v2di)__A, __v2si), (__v2si){0, 0}, 0, 1, 2, 3);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
- (__v4si) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
- (__v4si) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_epi32 (__m256i __A)
-{
- return (__m128i)__builtin_convertvector((__v4di)__A, __v4si);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm256_cvtepi64_epi32(__A),
- (__v4si)__O);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A)
-{
- return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M,
- (__v4si)_mm256_cvtepi64_epi32(__A),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_cvtepi64_epi16 (__m128i __A)
-{
- return (__m128i)__builtin_shufflevector(
- __builtin_convertvector((__v2di)__A, __v2hi), (__v2hi){0, 0}, 0, 1, 2, 3,
- 3, 3, 3, 3);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
- (__v8hi)__O,
- __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A)
-{
- return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
-{
- __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_cvtepi64_epi16 (__m256i __A)
-{
- return (__m128i)__builtin_shufflevector(
- __builtin_convertvector((__v4di)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1,
- 2, 3, 4, 5, 6, 7);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
- (__v8hi) __O, __M);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A)
-{
- return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
- (__v8hi) _mm_setzero_si128 (),
- __M);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
-{
- __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M);
-}
-
-#define _mm256_extractf32x4_ps(A, imm) \
- ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
- (int)(imm), \
- (__v4sf)_mm_undefined_ps(), \
- (__mmask8)-1))
-
-#define _mm256_mask_extractf32x4_ps(W, U, A, imm) \
- ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
- (int)(imm), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_extractf32x4_ps(U, A, imm) \
- ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \
- (int)(imm), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U)))
-
-#define _mm256_extracti32x4_epi32(A, imm) \
- ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
- (int)(imm), \
- (__v4si)_mm_undefined_si128(), \
- (__mmask8)-1))
-
-#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \
- ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
- (int)(imm), \
- (__v4si)(__m128i)(W), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_extracti32x4_epi32(U, A, imm) \
- ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \
- (int)(imm), \
- (__v4si)_mm_setzero_si128(), \
- (__mmask8)(U)))
-
-#define _mm256_insertf32x4(A, B, imm) \
- ((__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \
- (__v4sf)(__m128)(B), (int)(imm)))
-
-#define _mm256_mask_insertf32x4(W, U, A, B, imm) \
- ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
- (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
- (__v8sf)(__m256)(W)))
-
-#define _mm256_maskz_insertf32x4(U, A, B, imm) \
- ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
- (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \
- (__v8sf)_mm256_setzero_ps()))
-
-#define _mm256_inserti32x4(A, B, imm) \
- ((__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \
- (__v4si)(__m128i)(B), (int)(imm)))
-
-#define _mm256_mask_inserti32x4(W, U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
- (__v8si)(__m256i)(W)))
-
-#define _mm256_maskz_inserti32x4(U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm256_inserti32x4((A), (B), (imm)), \
- (__v8si)_mm256_setzero_si256()))
-
-#define _mm_getmant_pd(A, B, C) \
- ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
- (int)(((C)<<2) | (B)), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)-1))
-
-#define _mm_mask_getmant_pd(W, U, A, B, C) \
- ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
- (int)(((C)<<2) | (B)), \
- (__v2df)(__m128d)(W), \
- (__mmask8)(U)))
-
-#define _mm_maskz_getmant_pd(U, A, B, C) \
- ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \
- (int)(((C)<<2) | (B)), \
- (__v2df)_mm_setzero_pd(), \
- (__mmask8)(U)))
-
-#define _mm256_getmant_pd(A, B, C) \
- ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
- (int)(((C)<<2) | (B)), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)-1))
-
-#define _mm256_mask_getmant_pd(W, U, A, B, C) \
- ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
- (int)(((C)<<2) | (B)), \
- (__v4df)(__m256d)(W), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_getmant_pd(U, A, B, C) \
- ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \
- (int)(((C)<<2) | (B)), \
- (__v4df)_mm256_setzero_pd(), \
- (__mmask8)(U)))
-
-#define _mm_getmant_ps(A, B, C) \
- ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
- (int)(((C)<<2) | (B)), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)-1))
-
-#define _mm_mask_getmant_ps(W, U, A, B, C) \
- ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
- (int)(((C)<<2) | (B)), \
- (__v4sf)(__m128)(W), \
- (__mmask8)(U)))
-
-#define _mm_maskz_getmant_ps(U, A, B, C) \
- ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \
- (int)(((C)<<2) | (B)), \
- (__v4sf)_mm_setzero_ps(), \
- (__mmask8)(U)))
-
-#define _mm256_getmant_ps(A, B, C) \
- ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
- (int)(((C)<<2) | (B)), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)-1))
-
-#define _mm256_mask_getmant_ps(W, U, A, B, C) \
- ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
- (int)(((C)<<2) | (B)), \
- (__v8sf)(__m256)(W), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_getmant_ps(U, A, B, C) \
- ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \
- (int)(((C)<<2) | (B)), \
- (__v8sf)_mm256_setzero_ps(), \
- (__mmask8)(U)))
-
-#define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \
- ((__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \
- (void const *)(addr), \
- (__v2di)(__m128i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \
- ((__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \
- (void const *)(addr), \
- (__v2di)(__m128i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \
- ((__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \
- (void const *)(addr), \
- (__v4di)(__m256i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \
- ((__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \
- (void const *)(addr), \
- (__v4di)(__m256i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \
- ((__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \
- (void const *)(addr), \
- (__v2di)(__m128i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \
- ((__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \
- (void const *)(addr), \
- (__v2di)(__m128i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \
- ((__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \
- (void const *)(addr), \
- (__v4di)(__m256i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \
- ((__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \
- (void const *)(addr), \
- (__v4di)(__m256i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \
- ((__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \
- (void const *)(addr), \
- (__v4si)(__m128i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \
- ((__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \
- (void const *)(addr), \
- (__v4si)(__m128i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \
- ((__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \
- (void const *)(addr), \
- (__v4si)(__m128i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \
- ((__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \
- (void const *)(addr), \
- (__v4si)(__m128i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \
- ((__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \
- (void const *)(addr), \
- (__v4si)(__m128i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \
- ((__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \
- (void const *)(addr), \
- (__v4si)(__m128i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \
- ((__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \
- (void const *)(addr), \
- (__v8si)(__m256i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \
- ((__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \
- (void const *)(addr), \
- (__v8si)(__m256i)(index), \
- (__mmask8)(mask), (int)(scale)))
-
-#define _mm256_permutex_pd(X, C) \
- ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(C)))
-
-#define _mm256_mask_permutex_pd(W, U, X, C) \
- ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
- (__v4df)_mm256_permutex_pd((X), (C)), \
- (__v4df)(__m256d)(W)))
-
-#define _mm256_maskz_permutex_pd(U, X, C) \
- ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
- (__v4df)_mm256_permutex_pd((X), (C)), \
- (__v4df)_mm256_setzero_pd()))
-
-#define _mm256_permutex_epi64(X, C) \
- ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(C)))
-
-#define _mm256_mask_permutex_epi64(W, U, X, C) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm256_permutex_epi64((X), (C)), \
- (__v4di)(__m256i)(W)))
-
-#define _mm256_maskz_permutex_epi64(U, X, C) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm256_permutex_epi64((X), (C)), \
- (__v4di)_mm256_setzero_si256()))
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_permutexvar_pd (__m256i __X, __m256d __Y)
-{
- return (__m256d)__builtin_ia32_permvardf256((__v4df)__Y, (__v4di)__X);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
- __m256d __Y)
-{
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_permutexvar_pd(__X, __Y),
- (__v4df)__W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
-{
- return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U,
- (__v4df)_mm256_permutexvar_pd(__X, __Y),
- (__v4df)_mm256_setzero_pd());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_permvardi256((__v4di) __Y, (__v4di) __X);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_permutexvar_epi64(__X, __Y),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
- __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M,
- (__v4di)_mm256_permutexvar_epi64(__X, __Y),
- (__v4di)__W);
-}
-
-#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps((B), (A))
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_permutexvar_ps(__X, __Y),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_permutexvar_ps(__X, __Y),
- (__v8sf)_mm256_setzero_ps());
-}
-
-#define _mm256_permutexvar_epi32(A, B) _mm256_permutevar8x32_epi32((B), (A))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X,
- __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_permutexvar_epi32(__X, __Y),
- (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y)
-{
- return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M,
- (__v8si)_mm256_permutexvar_epi32(__X, __Y),
- (__v8si)_mm256_setzero_si256());
-}
-
-#define _mm_alignr_epi32(A, B, imm) \
- ((__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), (int)(imm)))
-
-#define _mm_mask_alignr_epi32(W, U, A, B, imm) \
- ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
- (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
- (__v4si)(__m128i)(W)))
-
-#define _mm_maskz_alignr_epi32(U, A, B, imm) \
- ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
- (__v4si)_mm_alignr_epi32((A), (B), (imm)), \
- (__v4si)_mm_setzero_si128()))
-
-#define _mm256_alignr_epi32(A, B, imm) \
- ((__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), (int)(imm)))
-
-#define _mm256_mask_alignr_epi32(W, U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
- (__v8si)(__m256i)(W)))
-
-#define _mm256_maskz_alignr_epi32(U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \
- (__v8si)_mm256_setzero_si256()))
-
-#define _mm_alignr_epi64(A, B, imm) \
- ((__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), (int)(imm)))
-
-#define _mm_mask_alignr_epi64(W, U, A, B, imm) \
- ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
- (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
- (__v2di)(__m128i)(W)))
-
-#define _mm_maskz_alignr_epi64(U, A, B, imm) \
- ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
- (__v2di)_mm_alignr_epi64((A), (B), (imm)), \
- (__v2di)_mm_setzero_si128()))
-
-#define _mm256_alignr_epi64(A, B, imm) \
- ((__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), (int)(imm)))
-
-#define _mm256_mask_alignr_epi64(W, U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
- (__v4di)(__m256i)(W)))
-
-#define _mm256_maskz_alignr_epi64(U, A, B, imm) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \
- (__v4di)_mm256_setzero_si256()))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)
-{
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_movehdup_ps(__A),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A)
-{
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_movehdup_ps(__A),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_movehdup_ps(__A),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_movehdup_ps(__A),
- (__v8sf)_mm256_setzero_ps());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A)
-{
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_moveldup_ps(__A),
- (__v4sf)__W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A)
-{
- return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
- (__v4sf)_mm_moveldup_ps(__A),
- (__v4sf)_mm_setzero_ps());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_moveldup_ps(__A),
- (__v8sf)__W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A)
-{
- return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
- (__v8sf)_mm256_moveldup_ps(__A),
- (__v8sf)_mm256_setzero_ps());
-}
-
-#define _mm256_mask_shuffle_epi32(W, U, A, I) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm256_shuffle_epi32((A), (I)), \
- (__v8si)(__m256i)(W)))
-
-#define _mm256_maskz_shuffle_epi32(U, A, I) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm256_shuffle_epi32((A), (I)), \
- (__v8si)_mm256_setzero_si256()))
-
-#define _mm_mask_shuffle_epi32(W, U, A, I) \
- ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
- (__v4si)_mm_shuffle_epi32((A), (I)), \
- (__v4si)(__m128i)(W)))
-
-#define _mm_maskz_shuffle_epi32(U, A, I) \
- ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
- (__v4si)_mm_shuffle_epi32((A), (I)), \
- (__v4si)_mm_setzero_si128()))
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A)
-{
- return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
- (__v2df) __A,
- (__v2df) __W);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maskz_mov_pd (__mmask8 __U, __m128d __A)
-{
- return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U,
- (__v2df) __A,
- (__v2df) _mm_setzero_pd ());
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A)
-{
- return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
- (__v4df) __A,
- (__v4df) __W);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A)
-{
- return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U,
- (__v4df) __A,
- (__v4df) _mm256_setzero_pd ());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A)
-{
- return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
- (__v4sf) __A,
- (__v4sf) __W);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_mov_ps (__mmask8 __U, __m128 __A)
-{
- return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U,
- (__v4sf) __A,
- (__v4sf) _mm_setzero_ps ());
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A)
-{
- return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
- (__v8sf) __A,
- (__v8sf) __W);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A)
-{
- return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U,
- (__v8sf) __A,
- (__v8sf) _mm256_setzero_ps ());
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
-{
- return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
- (__v4sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
-{
- return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
- (__v4sf)
- _mm_setzero_ps (),
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
-{
- return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
- (__v8sf) __W,
- (__mmask8) __U);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
-{
- return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
- (__v8sf)
- _mm256_setzero_ps (),
- (__mmask8) __U);
-}
-
-#define _mm_mask_cvt_roundps_ph(W, U, A, I) \
- ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
- (__v8hi)(__m128i)(W), \
- (__mmask8)(U)))
-
-#define _mm_maskz_cvt_roundps_ph(U, A, I) \
- ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \
- (__v8hi)_mm_setzero_si128(), \
- (__mmask8)(U)))
-
-#define _mm_mask_cvtps_ph _mm_mask_cvt_roundps_ph
-#define _mm_maskz_cvtps_ph _mm_maskz_cvt_roundps_ph
-
-#define _mm256_mask_cvt_roundps_ph(W, U, A, I) \
- ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
- (__v8hi)(__m128i)(W), \
- (__mmask8)(U)))
-
-#define _mm256_maskz_cvt_roundps_ph(U, A, I) \
- ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \
- (__v8hi)_mm_setzero_si128(), \
- (__mmask8)(U)))
-
-#define _mm256_mask_cvtps_ph _mm256_mask_cvt_roundps_ph
-#define _mm256_maskz_cvtps_ph _mm256_maskz_cvt_roundps_ph
-
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __AVX512VLINTRIN_H */
+++ /dev/null
-/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlvbmi2intrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLVBMI2INTRIN_H
-#define __AVX512VLVBMI2INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(256)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D)
-{
- return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
- (__v8hi) __S,
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D)
-{
- return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D,
- (__v8hi) _mm_setzero_si128(),
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D)
-{
- return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
- (__v16qi) __S,
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D)
-{
- return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D,
- (__v16qi) _mm_setzero_si128(),
- __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D)
-{
- __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D,
- __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D)
-{
- __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D,
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D)
-{
- return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
- (__v8hi) __S,
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D)
-{
- return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D,
- (__v8hi) _mm_setzero_si128(),
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D)
-{
- return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
- (__v16qi) __S,
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D)
-{
- return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D,
- (__v16qi) _mm_setzero_si128(),
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
- (__v8hi) __S,
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P,
- (__v8hi) _mm_setzero_si128(),
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
- (__v16qi) __S,
- __U);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P)
-{
- return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P,
- (__v16qi) _mm_setzero_si128(),
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D)
-{
- return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
- (__v16hi) __S,
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D)
-{
- return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D,
- (__v16hi) _mm256_setzero_si256(),
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D)
-{
- return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
- (__v32qi) __S,
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D)
-{
- return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D,
- (__v32qi) _mm256_setzero_si256(),
- __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D)
-{
- __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D,
- __U);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D)
-{
- __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D,
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D)
-{
- return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
- (__v16hi) __S,
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D)
-{
- return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D,
- (__v16hi) _mm256_setzero_si256(),
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D)
-{
- return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
- (__v32qi) __S,
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D)
-{
- return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D,
- (__v32qi) _mm256_setzero_si256(),
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
- (__v16hi) __S,
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P,
- (__v16hi) _mm256_setzero_si256(),
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
- (__v32qi) __S,
- __U);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P)
-{
- return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P,
- (__v32qi) _mm256_setzero_si256(),
- __U);
-}
-
-#define _mm256_shldi_epi64(A, B, I) \
- ((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), (int)(I)))
-
-#define _mm256_mask_shldi_epi64(S, U, A, B, I) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
- (__v4di)(__m256i)(S)))
-
-#define _mm256_maskz_shldi_epi64(U, A, B, I) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm256_shldi_epi64((A), (B), (I)), \
- (__v4di)_mm256_setzero_si256()))
-
-#define _mm_shldi_epi64(A, B, I) \
- ((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), (int)(I)))
-
-#define _mm_mask_shldi_epi64(S, U, A, B, I) \
- ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
- (__v2di)_mm_shldi_epi64((A), (B), (I)), \
- (__v2di)(__m128i)(S)))
-
-#define _mm_maskz_shldi_epi64(U, A, B, I) \
- ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
- (__v2di)_mm_shldi_epi64((A), (B), (I)), \
- (__v2di)_mm_setzero_si128()))
-
-#define _mm256_shldi_epi32(A, B, I) \
- ((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), (int)(I)))
-
-#define _mm256_mask_shldi_epi32(S, U, A, B, I) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
- (__v8si)(__m256i)(S)))
-
-#define _mm256_maskz_shldi_epi32(U, A, B, I) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm256_shldi_epi32((A), (B), (I)), \
- (__v8si)_mm256_setzero_si256()))
-
-#define _mm_shldi_epi32(A, B, I) \
- ((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), (int)(I)))
-
-#define _mm_mask_shldi_epi32(S, U, A, B, I) \
- ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
- (__v4si)_mm_shldi_epi32((A), (B), (I)), \
- (__v4si)(__m128i)(S)))
-
-#define _mm_maskz_shldi_epi32(U, A, B, I) \
- ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
- (__v4si)_mm_shldi_epi32((A), (B), (I)), \
- (__v4si)_mm_setzero_si128()))
-
-#define _mm256_shldi_epi16(A, B, I) \
- ((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \
- (__v16hi)(__m256i)(B), (int)(I)))
-
-#define _mm256_mask_shldi_epi16(S, U, A, B, I) \
- ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
- (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
- (__v16hi)(__m256i)(S)))
-
-#define _mm256_maskz_shldi_epi16(U, A, B, I) \
- ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
- (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \
- (__v16hi)_mm256_setzero_si256()))
-
-#define _mm_shldi_epi16(A, B, I) \
- ((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \
- (__v8hi)(__m128i)(B), (int)(I)))
-
-#define _mm_mask_shldi_epi16(S, U, A, B, I) \
- ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
- (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
- (__v8hi)(__m128i)(S)))
-
-#define _mm_maskz_shldi_epi16(U, A, B, I) \
- ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
- (__v8hi)_mm_shldi_epi16((A), (B), (I)), \
- (__v8hi)_mm_setzero_si128()))
-
-#define _mm256_shrdi_epi64(A, B, I) \
- ((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), (int)(I)))
-
-#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
- (__v4di)(__m256i)(S)))
-
-#define _mm256_maskz_shrdi_epi64(U, A, B, I) \
- ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
- (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \
- (__v4di)_mm256_setzero_si256()))
-
-#define _mm_shrdi_epi64(A, B, I) \
- ((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), (int)(I)))
-
-#define _mm_mask_shrdi_epi64(S, U, A, B, I) \
- ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
- (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
- (__v2di)(__m128i)(S)))
-
-#define _mm_maskz_shrdi_epi64(U, A, B, I) \
- ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \
- (__v2di)_mm_shrdi_epi64((A), (B), (I)), \
- (__v2di)_mm_setzero_si128()))
-
-#define _mm256_shrdi_epi32(A, B, I) \
- ((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \
- (__v8si)(__m256i)(B), (int)(I)))
-
-#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
- (__v8si)(__m256i)(S)))
-
-#define _mm256_maskz_shrdi_epi32(U, A, B, I) \
- ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
- (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \
- (__v8si)_mm256_setzero_si256()))
-
-#define _mm_shrdi_epi32(A, B, I) \
- ((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), (int)(I)))
-
-#define _mm_mask_shrdi_epi32(S, U, A, B, I) \
- ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
- (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
- (__v4si)(__m128i)(S)))
-
-#define _mm_maskz_shrdi_epi32(U, A, B, I) \
- ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
- (__v4si)_mm_shrdi_epi32((A), (B), (I)), \
- (__v4si)_mm_setzero_si128()))
-
-#define _mm256_shrdi_epi16(A, B, I) \
- ((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \
- (__v16hi)(__m256i)(B), (int)(I)))
-
-#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \
- ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
- (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
- (__v16hi)(__m256i)(S)))
-
-#define _mm256_maskz_shrdi_epi16(U, A, B, I) \
- ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \
- (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \
- (__v16hi)_mm256_setzero_si256()))
-
-#define _mm_shrdi_epi16(A, B, I) \
- ((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \
- (__v8hi)(__m128i)(B), (int)(I)))
-
-#define _mm_mask_shrdi_epi16(S, U, A, B, I) \
- ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
- (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
- (__v8hi)(__m128i)(S)))
-
-#define _mm_maskz_shrdi_epi16(U, A, B, I) \
- ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \
- (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \
- (__v8hi)_mm_setzero_si128()))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B,
- (__v4di)__C);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_selectq_256(__U,
- (__v4di)_mm256_shldv_epi64(__A, __B, __C),
- (__v4di)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_selectq_256(__U,
- (__v4di)_mm256_shldv_epi64(__A, __B, __C),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B,
- (__v2di)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_selectq_128(__U,
- (__v2di)_mm_shldv_epi64(__A, __B, __C),
- (__v2di)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_selectq_128(__U,
- (__v2di)_mm_shldv_epi64(__A, __B, __C),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B,
- (__v8si)__C);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_shldv_epi32(__A, __B, __C),
- (__v8si)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_shldv_epi32(__A, __B, __C),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B,
- (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_shldv_epi32(__A, __B, __C),
- (__v4si)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_shldv_epi32(__A, __B, __C),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B,
- (__v16hi)__C);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_selectw_256(__U,
- (__v16hi)_mm256_shldv_epi16(__A, __B, __C),
- (__v16hi)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_selectw_256(__U,
- (__v16hi)_mm256_shldv_epi16(__A, __B, __C),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B,
- (__v8hi)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_selectw_128(__U,
- (__v8hi)_mm_shldv_epi16(__A, __B, __C),
- (__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_selectw_128(__U,
- (__v8hi)_mm_shldv_epi16(__A, __B, __C),
- (__v8hi)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B,
- (__v4di)__C);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_selectq_256(__U,
- (__v4di)_mm256_shrdv_epi64(__A, __B, __C),
- (__v4di)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_selectq_256(__U,
- (__v4di)_mm256_shrdv_epi64(__A, __B, __C),
- (__v4di)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B,
- (__v2di)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_selectq_128(__U,
- (__v2di)_mm_shrdv_epi64(__A, __B, __C),
- (__v2di)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_selectq_128(__U,
- (__v2di)_mm_shrdv_epi64(__A, __B, __C),
- (__v2di)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B,
- (__v8si)__C);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_shrdv_epi32(__A, __B, __C),
- (__v8si)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_shrdv_epi32(__A, __B, __C),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B,
- (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_shrdv_epi32(__A, __B, __C),
- (__v4si)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_shrdv_epi32(__A, __B, __C),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B,
- (__v16hi)__C);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_selectw_256(__U,
- (__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
- (__v16hi)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)__builtin_ia32_selectw_256(__U,
- (__v16hi)_mm256_shrdv_epi16(__A, __B, __C),
- (__v16hi)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B,
- (__v8hi)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_selectw_128(__U,
- (__v8hi)_mm_shrdv_epi16(__A, __B, __C),
- (__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_selectw_128(__U,
- (__v8hi)_mm_shrdv_epi16(__A, __B, __C),
- (__v8hi)_mm_setzero_si128());
-}
-
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
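For reference, a minimal usage sketch of the funnel-shift intrinsics removed above (illustrative only, not part of the diff). It assumes a GCC/Clang build with -mavx512vl -mavx512vbmi2 on AVX-512 hardware and uses only standard helpers from <immintrin.h>; the expected per-lane value is worked out in the comments.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* Each lane: concatenate a (high 32 bits) and b (low 32 bits), shift the
     * 64-bit value left by the per-lane count in c, and keep the upper half. */
    __m128i a = _mm_set1_epi32(0x000000FF);
    __m128i b = _mm_set1_epi32((int)0xF0000000u);
    __m128i c = _mm_set1_epi32(8);

    __m128i r = _mm_shldv_epi32(a, b, c);   /* 0x000000FFF0000000 << 8 -> 0x0000FFF0 */

    unsigned int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    printf("0x%08x\n", out[0]);             /* prints 0x0000fff0 */
    return 0;
}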
+++ /dev/null
-/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VLVNNIINTRIN_H
-#define __AVX512VLVNNIINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
-/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a S, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
-/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
-/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
-/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
-/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-/// ENDFOR
-/// DST[MAX:256] := 0
-/// \endoperation
-#define _mm256_dpbusd_epi32(S, A, B) \
- ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
-/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a S using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
-/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
-/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
-/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
-/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-/// ENDFOR
-/// DST[MAX:256] := 0
-/// \endoperation
-#define _mm256_dpbusds_epi32(S, A, B) \
- ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
-/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
-/// and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
-/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
-/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
-/// ENDFOR
-/// DST[MAX:256] := 0
-/// \endoperation
-#define _mm256_dpwssd_epi32(S, A, B) \
- ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
-/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
-/// using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
-/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
-/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
-/// ENDFOR
-/// DST[MAX:256] := 0
-/// \endoperation
-#define _mm256_dpwssds_epi32(S, A, B) \
- ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
-/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a S, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 3
-/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
-/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
-/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
-/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
-/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-/// ENDFOR
-/// DST[MAX:128] := 0
-/// \endoperation
-#define _mm_dpbusd_epi32(S, A, B) \
- ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
-/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a S using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 3
-/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
-/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
-/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
-/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
-/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-/// ENDFOR
-/// DST[MAX:128] := 0
-/// \endoperation
-#define _mm_dpbusds_epi32(S, A, B) \
- ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
-/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
-/// and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 3
-/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
-/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
-/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
-/// ENDFOR
-/// DST[MAX:128] := 0
-/// \endoperation
-#define _mm_dpwssd_epi32(S, A, B) \
- ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
-/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
-/// using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 3
-/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
-/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
-/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
-/// ENDFOR
-/// DST[MAX:128] := 0
-/// \endoperation
-#define _mm_dpwssds_epi32(S, A, B) \
- ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
- (__v8si)__S);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
- (__v8si)__S);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
- (__v8si)__S);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
- (__v8si)__S);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_selectd_256(__U,
- (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
- (__v8si)_mm256_setzero_si256());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
- (__v4si)__S);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
- (__v4si)__S);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
- (__v4si)__S);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
- (__v4si)__S);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_selectd_128(__U,
- (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
- (__v4si)_mm_setzero_si128());
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
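A minimal sketch of the VNNI dot-product-accumulate forms removed above, illustrative only; it assumes -mavx512vl -mavx512vnni and matching hardware, with constants chosen so the per-dword result is easy to check by hand.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* Each 32-bit lane accumulates four u8 x s8 products: 4 * (2 * -3) = -24. */
    __m256i acc = _mm256_setzero_si256();
    __m256i a   = _mm256_set1_epi8(2);    /* treated as unsigned bytes */
    __m256i b   = _mm256_set1_epi8(-3);   /* treated as signed bytes   */

    acc = _mm256_dpbusd_epi32(acc, a, b);

    int out[8];
    _mm256_storeu_si256((__m256i *)out, acc);
    printf("%d\n", out[0]);               /* prints -24 */
    return 0;
}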
+++ /dev/null
-/*===------ avx512vlvp2intersectintrin.h - VL VP2INTERSECT intrinsics ------===
- *
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vlvp2intersectintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef _AVX512VLVP2INTERSECT_H
-#define _AVX512VLVP2INTERSECT_H
-
-#define __DEFAULT_FN_ATTRS128 \
- __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \
- __min_vector_width__(128)))
-
-#define __DEFAULT_FN_ATTRS256 \
- __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \
- __min_vector_width__(256)))
-/// Store, in an even/odd pair of mask registers, the indicators of the
-/// locations of value matches between dwords in operands __a and __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x i32].
-/// \param __b
-/// A 256-bit vector of [8 x i32].
-/// \param __m0
-/// A pointer to an 8-bit mask.
-/// \param __m1
-/// A pointer to an 8-bit mask.
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_2intersect_epi32(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
- __builtin_ia32_vp2intersect_d_256((__v8si)__a, (__v8si)__b, __m0, __m1);
-}
-
-/// Store, in an even/odd pair of mask registers, the indicators of the
-/// locations of value matches between quadwords in operands __a and __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x i64].
-/// \param __b
-/// A 256-bit vector of [4 x i64].
-/// \param __m0
-/// A pointer to an 8-bit mask.
-/// \param __m1
-/// A pointer to an 8-bit mask.
-static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_2intersect_epi64(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) {
- __builtin_ia32_vp2intersect_q_256((__v4di)__a, (__v4di)__b, __m0, __m1);
-}
-
-/// Store, in an even/odd pair of mask registers, the indicators of the
-/// locations of value matches between dwords in operands __a and __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x i32].
-/// \param __b
-/// A 128-bit vector of [4 x i32].
-/// \param __m0
-/// A pointer to an 8-bit mask.
-/// \param __m1
-/// A pointer to an 8-bit mask.
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_2intersect_epi32(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
- __builtin_ia32_vp2intersect_d_128((__v4si)__a, (__v4si)__b, __m0, __m1);
-}
-
-/// Store, in an even/odd pair of mask registers, the indicators of the
-/// locations of value matches between quadwords in operands __a and __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x i64].
-/// \param __b
-/// A 128-bit vector of [2 x i64].
-/// \param __m0
-/// A pointer to an 8-bit mask.
-/// \param __m1
-/// A pointer to an 8-bit mask.
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_2intersect_epi64(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) {
- __builtin_ia32_vp2intersect_q_128((__v2di)__a, (__v2di)__b, __m0, __m1);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
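An illustrative sketch of the two-mask intersection intrinsics removed above, assuming -mavx512vl -mavx512vp2intersect; the 512-bit header further below follows the same pattern with _mm512_2intersect_epi32/_mm512_2intersect_epi64.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* ma marks lanes of a whose value also occurs in b, and vice versa for mb. */
    __m128i a = _mm_setr_epi32(1, 2, 3, 4);
    __m128i b = _mm_setr_epi32(9, 3, 8, 1);
    __mmask8 ma, mb;

    _mm_2intersect_epi32(a, b, &ma, &mb);

    printf("ma=0x%x mb=0x%x\n", ma, mb);  /* ma=0x5 (lanes 0,2), mb=0xa (lanes 1,3) */
    return 0;
}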
+++ /dev/null
-/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VNNIINTRIN_H
-#define __AVX512VNNIINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"), __min_vector_width__(512)))
-
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A,
- (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
- (__v16si)__S);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_dpbusd_epi32(__S, __A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A,
- (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
- (__v16si)__S);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_dpbusds_epi32(__S, __A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A,
- (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
- (__v16si)__S);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_dpwssd_epi32(__S, __A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A,
- (__v16si)__B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
- (__v16si)__S);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B)
-{
- return (__m512i)__builtin_ia32_selectd_512(__U,
- (__v16si)_mm512_dpwssds_epi32(__S, __A, __B),
- (__v16si)_mm512_setzero_si512());
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
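A small sketch of the 512-bit VNNI word form removed above, again illustrative and assuming -mavx512vnni; the mask/maskz variants follow the same merge/zero pattern shown in the VL header earlier.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* Each 32-bit lane accumulates two s16 x s16 products: 2 * (100 * -7) = -1400. */
    __m512i acc = _mm512_setzero_si512();
    __m512i a   = _mm512_set1_epi16(100);
    __m512i b   = _mm512_set1_epi16(-7);

    acc = _mm512_dpwssd_epi32(acc, a, b);

    int out[16];
    _mm512_storeu_si512(out, acc);
    printf("%d\n", out[0]);   /* prints -1400 */
    return 0;
}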
+++ /dev/null
-/*===------- avx512vp2intersectintrin.h - VP2INTERSECT intrinsics -----------===
- *
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avx512vp2intersect.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef _AVX512VP2INTERSECT_H
-#define _AVX512VP2INTERSECT_H
-
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("avx512vp2intersect"), \
- __min_vector_width__(512)))
-
-/// Store, in an even/odd pair of mask registers, the indicators of the
-/// locations of value matches between dwords in operands __a and __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VP2INTERSECTD </c> instruction.
-///
-/// \param __a
-/// A 512-bit vector of [16 x i32].
-/// \param __b
-/// A 512-bit vector of [16 x i32].
-/// \param __m0
-/// A pointer to a 16-bit mask.
-/// \param __m1
-/// A pointer to a 16-bit mask.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm512_2intersect_epi32(__m512i __a, __m512i __b, __mmask16 *__m0, __mmask16 *__m1) {
- __builtin_ia32_vp2intersect_d_512((__v16si)__a, (__v16si)__b, __m0, __m1);
-}
-
-/// Store, in an even/odd pair of mask registers, the indicators of the
-/// locations of value matches between quadwords in operands __a and __b.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VP2INTERSECTQ </c> instruction.
-///
-/// \param __a
-/// A 512-bit vector of [8 x i64].
-/// \param __b
-/// A 512-bit vector of [8 x i64].
-/// \param __m0
-/// A pointer to an 8-bit mask.
-/// \param __m1
-/// A pointer to an 8-bit mask.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm512_2intersect_epi64(__m512i __a, __m512i __b, __mmask8 *__m0, __mmask8 *__m1) {
- __builtin_ia32_vp2intersect_q_512((__v8di)__a, (__v8di)__b, __m0, __m1);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
+++ /dev/null
-/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error \
- "Never use <avx512vpopcntdqintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VPOPCNTDQINTRIN_H
-#define __AVX512VPOPCNTDQINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq"), __min_vector_width__(512)))
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) {
- return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) {
- return (__m512i)__builtin_ia32_selectq_512(
- (__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) {
- return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) {
- return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) {
- return (__m512i)__builtin_ia32_selectd_512(
- (__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS
-_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) {
- return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
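A minimal sketch of the 512-bit per-lane population count removed above, assuming -mavx512vpopcntdq and only standard helpers from <immintrin.h>.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m512i v = _mm512_set1_epi64(0xF0F0F0F0LL);  /* 16 set bits per 64-bit lane */
    __m512i c = _mm512_popcnt_epi64(v);

    long long out[8];
    _mm512_storeu_si512(out, c);
    printf("%lld\n", out[0]);                     /* prints 16 */
    return 0;
}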
+++ /dev/null
-/*===---- avx512vpopcntdqvlintrin.h - AVX512VPOPCNTDQ intrinsics -----------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error \
- "Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVX512VPOPCNTDQVLINTRIN_H
-#define __AVX512VPOPCNTDQVLINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 \
- __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 \
- __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(256)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_popcnt_epi64(__m128i __A) {
- return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) {
- return (__m128i)__builtin_ia32_selectq_128(
- (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) {
- return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_popcnt_epi32(__m128i __A) {
- return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) {
- return (__m128i)__builtin_ia32_selectd_128(
- (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) {
- return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_popcnt_epi64(__m256i __A) {
- return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) {
- return (__m256i)__builtin_ia32_selectq_256(
- (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) {
- return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_popcnt_epi32(__m256i __A) {
- return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) {
- return (__m256i)__builtin_ia32_selectd_256(
- (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) {
- return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif
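A sketch of the 128-bit variant with a merge mask, as declared in the header removed above; it assumes -mavx512vl -mavx512vpopcntdq, and the mask value 0x5 is an arbitrary example.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i v = _mm_setr_epi32(0, 1, 3, 7);        /* popcounts 0, 1, 2, 3 */
    __m128i w = _mm_set1_epi32(-1);                /* kept in masked-off lanes */
    __m128i c = _mm_mask_popcnt_epi32(w, 0x5, v);  /* update lanes 0 and 2 only */

    int out[4];
    _mm_storeu_si128((__m128i *)out, c);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* 0 -1 2 -1 */
    return 0;
}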
+++ /dev/null
-/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVXINTRIN_H
-#define __AVXINTRIN_H
-
-typedef double __v4df __attribute__ ((__vector_size__ (32)));
-typedef float __v8sf __attribute__ ((__vector_size__ (32)));
-typedef long long __v4di __attribute__ ((__vector_size__ (32)));
-typedef int __v8si __attribute__ ((__vector_size__ (32)));
-typedef short __v16hi __attribute__ ((__vector_size__ (32)));
-typedef char __v32qi __attribute__ ((__vector_size__ (32)));
-
-/* Unsigned types */
-typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
-typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
-typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
-typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
-
-/* We need an explicitly signed variant for char. Note that this shouldn't
- * appear in the interface though. */
-typedef signed char __v32qs __attribute__((__vector_size__(32)));
-
-typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
-typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
-typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));
-
-typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
-typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
-typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
-
-#if (__clang_major__ > 15)
-#ifdef __SSE2__
-/* Both _Float16 and __bf16 require SSE2 being enabled. */
-typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
-typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
-typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
-
-typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
-typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
-#endif
-#endif
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
-
-/* Arithmetic */
-/// Adds two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing one of the source operands.
-/// \param __b
-/// A 256-bit vector of [4 x double] containing one of the source operands.
-/// \returns A 256-bit vector of [4 x double] containing the sums of both
-/// operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_add_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)((__v4df)__a+(__v4df)__b);
-}
-
-/// Adds two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing one of the source operands.
-/// \param __b
-/// A 256-bit vector of [8 x float] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x float] containing the sums of both
-/// operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_add_ps(__m256 __a, __m256 __b)
-{
- return (__m256)((__v8sf)__a+(__v8sf)__b);
-}
-
-/// Subtracts two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing the minuend.
-/// \param __b
-/// A 256-bit vector of [4 x double] containing the subtrahend.
-/// \returns A 256-bit vector of [4 x double] containing the differences between
-/// both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sub_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)((__v4df)__a-(__v4df)__b);
-}
-
-/// Subtracts two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing the minuend.
-/// \param __b
-/// A 256-bit vector of [8 x float] containing the subtrahend.
-/// \returns A 256-bit vector of [8 x float] containing the differences between
-/// both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sub_ps(__m256 __a, __m256 __b)
-{
- return (__m256)((__v8sf)__a-(__v8sf)__b);
-}
-
-/// Adds the even-indexed values and subtracts the odd-indexed values of
-/// two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing the left source operand.
-/// \param __b
-/// A 256-bit vector of [4 x double] containing the right source operand.
-/// \returns A 256-bit vector of [4 x double] containing the alternating sums
-/// and differences between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_addsub_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Adds the even-indexed values and subtracts the odd-indexed values of
-/// two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing the left source operand.
-/// \param __b
-/// A 256-bit vector of [8 x float] containing the right source operand.
-/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
-/// differences between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_addsub_ps(__m256 __a, __m256 __b)
-{
- return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Divides two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing the dividend.
-/// \param __b
-/// A 256-bit vector of [4 x double] containing the divisor.
-/// \returns A 256-bit vector of [4 x double] containing the quotients of both
-/// operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_div_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)((__v4df)__a/(__v4df)__b);
-}
-
-/// Divides two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing the dividend.
-/// \param __b
-/// A 256-bit vector of [8 x float] containing the divisor.
-/// \returns A 256-bit vector of [8 x float] containing the quotients of both
-/// operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_div_ps(__m256 __a, __m256 __b)
-{
- return (__m256)((__v8sf)__a/(__v8sf)__b);
-}
-
-/// Compares two 256-bit vectors of [4 x double] and returns the greater
-/// of each pair of values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing one of the operands.
-/// \param __b
-/// A 256-bit vector of [4 x double] containing one of the operands.
-/// \returns A 256-bit vector of [4 x double] containing the maximum values
-/// between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_max_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Compares two 256-bit vectors of [8 x float] and returns the greater
-/// of each pair of values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing one of the operands.
-/// \param __b
-/// A 256-bit vector of [8 x float] containing one of the operands.
-/// \returns A 256-bit vector of [8 x float] containing the maximum values
-/// between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_max_ps(__m256 __a, __m256 __b)
-{
- return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Compares two 256-bit vectors of [4 x double] and returns the lesser
-/// of each pair of values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing one of the operands.
-/// \param __b
-/// A 256-bit vector of [4 x double] containing one of the operands.
-/// \returns A 256-bit vector of [4 x double] containing the minimum values
-/// between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_min_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Compares two 256-bit vectors of [8 x float] and returns the lesser
-/// of each pair of values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing one of the operands.
-/// \param __b
-/// A 256-bit vector of [8 x float] containing one of the operands.
-/// \returns A 256-bit vector of [8 x float] containing the minimum values
-/// between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_min_ps(__m256 __a, __m256 __b)
-{
- return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Multiplies two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing one of the operands.
-/// \param __b
-/// A 256-bit vector of [4 x double] containing one of the operands.
-/// \returns A 256-bit vector of [4 x double] containing the products of both
-/// operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_mul_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)((__v4df)__a * (__v4df)__b);
-}
-
-/// Multiplies two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing one of the operands.
-/// \param __b
-/// A 256-bit vector of [8 x float] containing one of the operands.
-/// \returns A 256-bit vector of [8 x float] containing the products of both
-/// operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_mul_ps(__m256 __a, __m256 __b)
-{
- return (__m256)((__v8sf)__a * (__v8sf)__b);
-}
-
-/// Calculates the square roots of the values in a 256-bit vector of
-/// [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double].
-/// \returns A 256-bit vector of [4 x double] containing the square roots of the
-/// values in the operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_sqrt_pd(__m256d __a)
-{
- return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
-}
-
-/// Calculates the square roots of the values in a 256-bit vector of
-/// [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the square roots of the
-/// values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_sqrt_ps(__m256 __a)
-{
- return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
-}
-
-/// Calculates the reciprocal square roots of the values in a 256-bit
-/// vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
-/// roots of the values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_rsqrt_ps(__m256 __a)
-{
- return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
-}
-
-/// Calculates the reciprocals of the values in a 256-bit vector of
-/// [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
-/// values in the operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_rcp_ps(__m256 __a)
-{
- return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
-}
-
-/// Rounds the values in a 256-bit vector of [4 x double] as specified
-/// by the byte operand. The source values are rounded to integer values and
-/// returned as 64-bit double-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_round_pd(__m256d V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
-///
-/// \param V
-/// A 256-bit vector of [4 x double].
-/// \param M
-/// An integer value that specifies the rounding operation. \n
-/// Bits [7:4] are reserved. \n
-/// Bit [3] is a precision exception value: \n
-/// 0: A normal PE exception is used. \n
-/// 1: The PE field is not updated. \n
-/// Bit [2] is the rounding control source: \n
-/// 0: Use bits [1:0] of \a M. \n
-/// 1: Use the current MXCSR setting. \n
-/// Bits [1:0] contain the rounding control definition: \n
-/// 00: Nearest. \n
-/// 01: Downward (toward negative infinity). \n
-/// 10: Upward (toward positive infinity). \n
-/// 11: Truncated.
-/// \returns A 256-bit vector of [4 x double] containing the rounded values.
-#define _mm256_round_pd(V, M) \
- ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
-
-/// Rounds the values stored in a 256-bit vector of [8 x float] as
-/// specified by the byte operand. The source values are rounded to integer
-/// values and returned as floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_round_ps(__m256 V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
-///
-/// \param V
-/// A 256-bit vector of [8 x float].
-/// \param M
-/// An integer value that specifies the rounding operation. \n
-/// Bits [7:4] are reserved. \n
-/// Bit [3] is a precision exception value: \n
-/// 0: A normal PE exception is used. \n
-/// 1: The PE field is not updated. \n
-/// Bit [2] is the rounding control source: \n
-/// 0: Use bits [1:0] of \a M. \n
-/// 1: Use the current MXCSR setting. \n
-/// Bits [1:0] contain the rounding control definition: \n
-/// 00: Nearest. \n
-/// 01: Downward (toward negative infinity). \n
-/// 10: Upward (toward positive infinity). \n
-/// 11: Truncated.
-/// \returns A 256-bit vector of [8 x float] containing the rounded values.
-#define _mm256_round_ps(V, M) \
- ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
-
-/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
-/// source values are rounded up to integer values and returned as 64-bit
-/// double-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_ceil_pd(__m256d V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
-///
-/// \param V
-/// A 256-bit vector of [4 x double].
-/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
-#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
-
-/// Rounds down the values stored in a 256-bit vector of [4 x double].
-/// The source values are rounded down to integer values and returned as
-/// 64-bit double-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_floor_pd(__m256d V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
-///
-/// \param V
-/// A 256-bit vector of [4 x double].
-/// \returns A 256-bit vector of [4 x double] containing the rounded down
-/// values.
-#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
-
-/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
-/// source values are rounded up to integer values and returned as
-/// floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_ceil_ps(__m256 V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
-///
-/// \param V
-/// A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
-#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
-
-/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
-/// source values are rounded down to integer values and returned as
-/// floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_floor_ps(__m256 V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
-///
-/// \param V
-/// A 256-bit vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
-#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
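A short sketch of the rounding macros documented above, assuming -mavx and the _MM_FROUND_* constants that <immintrin.h> already provides; it contrasts an explicit rounding mode with the _mm256_ceil_pd convenience form.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256d v = _mm256_setr_pd(1.25, -1.25, 2.5, -2.5);

    __m256d nearest = _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    __m256d up      = _mm256_ceil_pd(v);  /* same as _mm256_round_pd(v, _MM_FROUND_CEIL) */

    double n[4], u[4];
    _mm256_storeu_pd(n, nearest);
    _mm256_storeu_pd(u, up);
    printf("%g %g %g %g\n", n[0], n[1], n[2], n[3]);  /* 1 -1 2 -2 (ties to even) */
    printf("%g %g %g %g\n", u[0], u[1], u[2], u[3]);  /* 2 -1 3 -2 */
    return 0;
}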
-
-/* Logical */
-/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing one of the source operands.
-/// \param __b
-/// A 256-bit vector of [4 x double] containing one of the source operands.
-/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
-/// values between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_and_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)((__v4du)__a & (__v4du)__b);
-}
-
-/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing one of the source operands.
-/// \param __b
-/// A 256-bit vector of [8 x float] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
-/// values between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_and_ps(__m256 __a, __m256 __b)
-{
- return (__m256)((__v8su)__a & (__v8su)__b);
-}
-
-/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
-/// the one's complement of the values contained in the first source operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing the left source operand. The
-/// one's complement of this value is used in the bitwise AND.
-/// \param __b
-/// A 256-bit vector of [4 x double] containing the right source operand.
-/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
-/// values of the second operand and the one's complement of the first
-/// operand.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_andnot_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)(~(__v4du)__a & (__v4du)__b);
-}
-
-/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
-/// the one's complement of the values contained in the first source operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing the left source operand. The
-/// one's complement of this value is used in the bitwise AND.
-/// \param __b
-/// A 256-bit vector of [8 x float] containing the right source operand.
-/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
-/// values of the second operand and the one's complement of the first
-/// operand.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_andnot_ps(__m256 __a, __m256 __b)
-{
- return (__m256)(~(__v8su)__a & (__v8su)__b);
-}
-
-/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VORPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing one of the source operands.
-/// \param __b
-/// A 256-bit vector of [4 x double] containing one of the source operands.
-/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
-/// values between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_or_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)((__v4du)__a | (__v4du)__b);
-}
-
-/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VORPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing one of the source operands.
-/// \param __b
-/// A 256-bit vector of [8 x float] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
-/// values between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_or_ps(__m256 __a, __m256 __b)
-{
- return (__m256)((__v8su)__a | (__v8su)__b);
-}
-
-/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing one of the source operands.
-/// \param __b
-/// A 256-bit vector of [4 x double] containing one of the source operands.
-/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
-/// values between both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_xor_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)((__v4du)__a ^ (__v4du)__b);
-}
-
-/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing one of the source operands.
-/// \param __b
-/// A 256-bit vector of [8 x float] containing one of the source operands.
-/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
-/// values between both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_xor_ps(__m256 __a, __m256 __b)
-{
- return (__m256)((__v8su)__a ^ (__v8su)__b);
-}
-
-/* Horizontal arithmetic */
-/// Horizontally adds the adjacent pairs of values contained in two
-/// 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing one of the source operands.
-/// The horizontal sums of the values are returned in the even-indexed
-/// elements of a vector of [4 x double].
-/// \param __b
-/// A 256-bit vector of [4 x double] containing one of the source operands.
-/// The horizontal sums of the values are returned in the odd-indexed
-/// elements of a vector of [4 x double].
-/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
-/// both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_hadd_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Horizontally adds the adjacent pairs of values contained in two
-/// 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing one of the source operands.
-/// The horizontal sums of the values are returned in the elements with
-/// index 0, 1, 4, 5 of a vector of [8 x float].
-/// \param __b
-/// A 256-bit vector of [8 x float] containing one of the source operands.
-/// The horizontal sums of the values are returned in the elements with
-/// index 2, 3, 6, 7 of a vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
-/// both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_hadd_ps(__m256 __a, __m256 __b)
-{
- return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in two
-/// 256-bit vectors of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing one of the source operands.
-/// The horizontal differences between the values are returned in the
-/// even-indexed elements of a vector of [4 x double].
-/// \param __b
-/// A 256-bit vector of [4 x double] containing one of the source operands.
-/// The horizontal differences between the values are returned in the
-/// odd-indexed elements of a vector of [4 x double].
-/// \returns A 256-bit vector of [4 x double] containing the horizontal
-/// differences of both operands.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_hsub_pd(__m256d __a, __m256d __b)
-{
- return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in two
-/// 256-bit vectors of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing one of the source operands.
-/// The horizontal differences between the values are returned in the
-/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
-/// \param __b
-/// A 256-bit vector of [8 x float] containing one of the source operands.
-/// The horizontal differences between the values are returned in the
-/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
-/// \returns A 256-bit vector of [8 x float] containing the horizontal
-/// differences of both operands.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_hsub_ps(__m256 __a, __m256 __b)
-{
- return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/* Vector permutations */
-/// Copies the values in a 128-bit vector of [2 x double] as specified
-/// by the 128-bit integer vector operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __c
-/// A 128-bit integer vector operand specifying how the values are to be
-/// copied. \n
-/// Bit [1]: \n
-/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
-/// vector. \n
-/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
-/// returned vector. \n
-/// Bit [65]: \n
-/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
-/// returned vector. \n
-/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// \returns A 128-bit vector of [2 x double] containing the copied values.
-static __inline __m128d __DEFAULT_FN_ATTRS128
-_mm_permutevar_pd(__m128d __a, __m128i __c)
-{
- return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
-}
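-
-/* Usage sketch (illustrative only; assumes <immintrin.h> and the standard
-   _mm_set_pd/_mm_set_epi64x helpers): only bit 1 of each 64-bit control
-   element is consulted, so a selector of 0x2 picks the high source element.
-
-     __m128d v = _mm_set_pd(20.0, 10.0);       // v = {10, 20} low->high
-     __m128i c = _mm_set_epi64x(0x0, 0x2);     // low selector: bit 1 set; high selector: clear
-     __m128d r = _mm_permutevar_pd(v, c);      // r = {20, 10}
-*/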
-
-/// Copies the values in a 256-bit vector of [4 x double] as specified
-/// by the 256-bit integer vector operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double].
-/// \param __c
-/// A 256-bit integer vector operand specifying how the values are to be
-/// copied. \n
-/// Bit [1]: \n
-/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
-/// vector. \n
-/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
-/// returned vector. \n
-/// Bit [65]: \n
-/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
-/// returned vector. \n
-/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
-/// returned vector. \n
-/// Bit [129]: \n
-/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
-/// returned vector. \n
-/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
-/// returned vector. \n
-/// Bit [193]: \n
-/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
-/// returned vector. \n
-/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
-/// returned vector.
-/// \returns A 256-bit vector of [4 x double] containing the copied values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_permutevar_pd(__m256d __a, __m256i __c)
-{
- return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
-}
-
-/// Copies the values stored in a 128-bit vector of [4 x float] as
-/// specified by the 128-bit integer vector operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __c
-/// A 128-bit integer vector operand specifying how the values are to be
-/// copied. \n
-/// Bits [1:0]: \n
-/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// Bits [33:32]: \n
-/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// Bits [65:64]: \n
-/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// Bits [97:96]: \n
-/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// \returns A 128-bit vector of [4 x float] containing the copied values.
-static __inline __m128 __DEFAULT_FN_ATTRS128
-_mm_permutevar_ps(__m128 __a, __m128i __c)
-{
- return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
-}
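-
-/* Usage sketch (illustrative only; assumes <immintrin.h> and the standard
-   _mm_set_ps/_mm_set_epi32 helpers): each 32-bit control element supplies a
-   2-bit source index, so the index pattern {3,2,1,0} reverses the vector.
-
-     __m128  v   = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // v = {1,2,3,4} low->high
-     __m128i idx = _mm_set_epi32(0, 1, 2, 3);            // indices {3,2,1,0} low->high
-     __m128  r   = _mm_permutevar_ps(v, idx);            // r = {4,3,2,1}
-*/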
-
-/// Copies the values stored in a 256-bit vector of [8 x float] as
-/// specified by the 256-bit integer vector operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float].
-/// \param __c
-/// A 256-bit integer vector operand specifying how the values are to be
-/// copied. \n
-/// Bits [1:0]: \n
-/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// Bits [33:32]: \n
-/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// Bits [65:64]: \n
-/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// Bits [97:96]: \n
-/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// Bits [129:128]: \n
-/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
-/// returned vector. \n
-/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
-/// returned vector. \n
-/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
-/// returned vector. \n
-/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
-/// returned vector. \n
-/// Bits [161:160]: \n
-/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
-/// returned vector. \n
-/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
-/// returned vector. \n
-/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
-/// returned vector. \n
-/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
-/// returned vector. \n
-/// Bits [193:192]: \n
-/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
-/// returned vector. \n
-/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
-/// returned vector. \n
-/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
-/// returned vector. \n
-/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
-/// returned vector. \n
-/// Bits [225:224]: \n
-/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
-/// returned vector. \n
-/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
-/// returned vector. \n
-/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
-/// returned vector. \n
-/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// \returns A 256-bit vector of [8 x float] containing the copied values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_permutevar_ps(__m256 __a, __m256i __c)
-{
- return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
-}
-
-/// Copies the values in a 128-bit vector of [2 x double] as specified
-/// by the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_permute_pd(__m128d A, const int C);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
-///
-/// \param A
-/// A 128-bit vector of [2 x double].
-/// \param C
-/// An immediate integer operand specifying how the values are to be
-/// copied. \n
-/// Bit [0]: \n
-/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
-/// vector. \n
-/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
-/// returned vector. \n
-/// Bit [1]: \n
-/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
-/// returned vector. \n
-/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
-/// returned vector.
-/// \returns A 128-bit vector of [2 x double] containing the copied values.
-#define _mm_permute_pd(A, C) \
- ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
-
-/// Copies the values in a 256-bit vector of [4 x double] as specified by
-/// the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_permute_pd(__m256d A, const int C);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
-///
-/// \param A
-/// A 256-bit vector of [4 x double].
-/// \param C
-/// An immediate integer operand specifying how the values are to be
-/// copied. \n
-/// Bit [0]: \n
-/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
-/// vector. \n
-/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
-/// returned vector. \n
-/// Bit [1]: \n
-/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
-/// returned vector. \n
-/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
-/// returned vector. \n
-/// Bit [2]: \n
-/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
-/// returned vector. \n
-/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
-/// returned vector. \n
-/// Bit [3]: \n
-/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
-/// returned vector. \n
-/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
-/// returned vector.
-/// \returns A 256-bit vector of [4 x double] containing the copied values.
-#define _mm256_permute_pd(A, C) \
- ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
-
-/// Copies the values in a 128-bit vector of [4 x float] as specified by
-/// the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_permute_ps(__m128 A, const int C);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
-///
-/// \param A
-/// A 128-bit vector of [4 x float].
-/// \param C
-/// An immediate integer operand specifying how the values are to be
-/// copied. \n
-/// Bits [1:0]: \n
-/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// Bits [3:2]: \n
-/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// Bits [5:4]: \n
-/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// Bits [7:6]: \n
-/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
-/// returned vector.
-/// \returns A 128-bit vector of [4 x float] containing the copied values.
-#define _mm_permute_ps(A, C) \
- ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
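-
-/* Usage sketch (illustrative only; assumes <immintrin.h>): each 2-bit field
-   of the immediate picks a source element, so 0xAA (binary 10 10 10 10)
-   broadcasts element 2; _MM_SHUFFLE(2, 2, 2, 2) from the SSE headers expands
-   to the same value.
-
-     __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // v = {1,2,3,4} low->high
-     __m128 r = _mm_permute_ps(v, 0xAA);             // r = {3,3,3,3}
-*/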
-
-/// Copies the values in a 256-bit vector of [8 x float] as specified by
-/// the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_permute_ps(__m256 A, const int C);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
-///
-/// \param A
-/// A 256-bit vector of [8 x float].
-/// \param C
-/// An immediate integer operand specifying how the values are to be
-/// copied. \n
-/// Bits [1:0]: \n
-/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
-/// returned vector. \n
-/// Bits [3:2]: \n
-/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
-/// returned vector. \n
-/// Bits [5:4]: \n
-/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
-/// returned vector. \n
-/// Bits [7:6]: \n
-/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
-/// returned vector. \n
-/// Bits [1:0] (reused for the upper 128 bits): \n
-/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
-/// returned vector. \n
-/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
-/// returned vector. \n
-/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
-/// returned vector. \n
-/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
-/// returned vector. \n
-/// Bits [3:2] (reused for the upper 128 bits): \n
-/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
-/// returned vector. \n
-/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
-/// returned vector. \n
-/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
-/// returned vector. \n
-/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
-/// returned vector. \n
-/// Bits [5:4] (reused for the upper 128 bits): \n
-/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
-/// returned vector. \n
-/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
-/// returned vector. \n
-/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
-/// returned vector. \n
-/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
-/// returned vector. \n
-/// Bits [7:6] (reused for the upper 128 bits): \n
-/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
-/// returned vector. \n
-/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
-/// returned vector. \n
-/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
-/// returned vector. \n
-/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
-/// returned vector.
-/// \returns A 256-bit vector of [8 x float] containing the copied values.
-#define _mm256_permute_ps(A, C) \
- ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
-
-/// Permutes 128-bit data values stored in two 256-bit vectors of
-/// [4 x double], as specified by the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
-///
-/// \param V1
-/// A 256-bit vector of [4 x double].
-/// \param V2
-/// A 256-bit vector of [4 x double].
-/// \param M
-/// An immediate integer operand specifying how the values are to be
-/// permuted. \n
-/// Bits [1:0]: \n
-/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
-/// destination. \n
-/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
-/// destination. \n
-/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
-/// destination. \n
-/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
-/// destination. \n
-/// Bits [5:4]: \n
-/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
-/// destination. \n
-/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
-/// destination. \n
-/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
-/// destination. \n
-/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
-/// destination.
-/// \returns A 256-bit vector of [4 x double] containing the copied values.
-#define _mm256_permute2f128_pd(V1, V2, M) \
- ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
- (__v4df)(__m256d)(V2), (int)(M)))
-
-/// Permutes 128-bit data values stored in two 256-bit vectors of
-/// [8 x float], as specified by the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
-///
-/// \param V1
-/// A 256-bit vector of [8 x float].
-/// \param V2
-/// A 256-bit vector of [8 x float].
-/// \param M
-/// An immediate integer operand specifying how the values are to be
-/// permuted. \n
-/// Bits [1:0]: \n
-/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
-/// destination. \n
-/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
-/// destination. \n
-/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
-/// destination. \n
-/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
-/// destination. \n
-/// Bits [5:4]: \n
-/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
-/// destination. \n
-/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
-/// destination. \n
-/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
-/// destination. \n
-/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
-/// destination.
-/// \returns A 256-bit vector of [8 x float] containing the copied values.
-#define _mm256_permute2f128_ps(V1, V2, M) \
- ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
- (__v8sf)(__m256)(V2), (int)(M)))
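-
-/* Usage sketch (illustrative only; assumes <immintrin.h> and the standard
-   _mm256_set_ps helper): passing the same vector twice with M = 0x01 is the
-   usual idiom for swapping the two 128-bit halves.
-
-     __m256 v = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f);
-     __m256 r = _mm256_permute2f128_ps(v, v, 0x01);
-     // bits [1:0] = 01 -> low half = high half of V1; bits [5:4] = 00 -> high half = low half of V1
-     // r = {5,6,7,8, 1,2,3,4} low->high
-*/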
-
-/// Permutes 128-bit data values stored in two 256-bit integer vectors,
-/// as specified by the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
-///
-/// \param V1
-/// A 256-bit integer vector.
-/// \param V2
-/// A 256-bit integer vector.
-/// \param M
-/// An immediate integer operand specifying how the values are to be copied. \n
-/// Bits [1:0]: \n
-/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
-/// destination. \n
-/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
-/// destination. \n
-/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
-/// destination. \n
-/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
-/// destination. \n
-/// Bits [5:4]: \n
-/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
-/// destination. \n
-/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
-/// destination. \n
-/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
-/// destination. \n
-/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
-/// destination.
-/// \returns A 256-bit integer vector containing the copied values.
-#define _mm256_permute2f128_si256(V1, V2, M) \
- ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
- (__v8si)(__m256i)(V2), (int)(M)))
-
-/* Vector Blend */
-/// Merges 64-bit double-precision data values stored in either of the
-/// two 256-bit vectors of [4 x double], as specified by the immediate
-/// integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
-///
-/// \param V1
-/// A 256-bit vector of [4 x double].
-/// \param V2
-/// A 256-bit vector of [4 x double].
-/// \param M
-/// An immediate integer operand, with mask bits [3:0] specifying how the
-/// values are to be copied. The position of the mask bit corresponds to the
-/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
-/// element in operand \a V1 is copied to the same position in the
-/// destination. When a mask bit is 1, the corresponding 64-bit element in
-/// operand \a V2 is copied to the same position in the destination.
-/// \returns A 256-bit vector of [4 x double] containing the copied values.
-#define _mm256_blend_pd(V1, V2, M) \
- ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
- (__v4df)(__m256d)(V2), (int)(M)))
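-
-/* Usage sketch (illustrative only; assumes <immintrin.h> and the standard
-   _mm256_set_pd helper): mask bit i selects element i, 0 = V1, 1 = V2.
-
-     __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);   // {1,2,3,4} low->high
-     __m256d b = _mm256_set_pd(8.0, 7.0, 6.0, 5.0);   // {5,6,7,8} low->high
-     __m256d r = _mm256_blend_pd(a, b, 0x5);          // mask 0b0101 -> {5,2,7,4}
-*/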
-
-/// Merges 32-bit single-precision data values stored in either of the
-/// two 256-bit vectors of [8 x float], as specified by the immediate
-/// integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
-///
-/// \param V1
-/// A 256-bit vector of [8 x float].
-/// \param V2
-/// A 256-bit vector of [8 x float].
-/// \param M
-/// An immediate integer operand, with mask bits [7:0] specifying how the
-/// values are to be copied. The position of the mask bit corresponds to the
-/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
-/// element in operand \a V1 is copied to the same position in the
-/// destination. When a mask bit is 1, the corresponding 32-bit element in
-/// operand \a V2 is copied to the same position in the destination.
-/// \returns A 256-bit vector of [8 x float] containing the copied values.
-#define _mm256_blend_ps(V1, V2, M) \
- ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
- (__v8sf)(__m256)(V2), (int)(M)))
-
-/// Merges 64-bit double-precision data values stored in either of the
-/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
-/// operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double].
-/// \param __b
-/// A 256-bit vector of [4 x double].
-/// \param __c
-/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
-/// how the values are to be copied. The position of the mask bit corresponds
-/// to the most significant bit of a copied value. When a mask bit is 0, the
-/// corresponding 64-bit element in operand \a __a is copied to the same
-/// position in the destination. When a mask bit is 1, the corresponding
-/// 64-bit element in operand \a __b is copied to the same position in the
-/// destination.
-/// \returns A 256-bit vector of [4 x double] containing the copied values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
-{
- return (__m256d)__builtin_ia32_blendvpd256(
- (__v4df)__a, (__v4df)__b, (__v4df)__c);
-}
-
-/// Merges 32-bit single-precision data values stored in either of the
-/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
-/// operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float].
-/// \param __b
-/// A 256-bit vector of [8 x float].
-/// \param __c
-/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
-/// and 31 specifying how the values are to be copied. The position of the
-/// mask bit corresponds to the most significant bit of a copied value. When
-/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
-/// copied to the same position in the destination. When a mask bit is 1, the
-/// corresponding 32-bit element in operand \a __b is copied to the same
-/// position in the destination.
-/// \returns A 256-bit vector of [8 x float] containing the copied values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
-{
- return (__m256)__builtin_ia32_blendvps256(
- (__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
-}
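-
-/* Usage sketch (illustrative only; assumes <immintrin.h>; _mm256_cmp_ps,
-   _mm256_setzero_ps and the _CMP_* codes are declared elsewhere in this
-   header family): a comparison mask makes blendv act as a per-element select.
-
-     __m256 x    = _mm256_set_ps(8.0f, -7.0f, 6.0f, -5.0f, 4.0f, -3.0f, 2.0f, -1.0f);
-     __m256 zero = _mm256_setzero_ps();
-     __m256 m    = _mm256_cmp_ps(x, zero, _CMP_LT_OQ);  // all-ones where x < 0
-     __m256 r    = _mm256_blendv_ps(x, zero, m);        // negative lanes clamped to 0
-*/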
-
-/* Vector Dot Product */
-/// Computes two dot products in parallel, using the lower and upper
-/// halves of two [8 x float] vectors as input to the two computations, and
-/// returning the two dot products in the lower and upper halves of the
-/// [8 x float] result.
-///
-/// The immediate integer operand controls which input elements will
-/// contribute to the dot product, and where the final results are returned.
-/// In general, for each dot product, the four corresponding elements of the
-/// input vectors are multiplied; the first two and second two products are
-/// summed, then the two sums are added to form the final result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
-///
-/// \param V1
-/// A vector of [8 x float] values, treated as two [4 x float] vectors.
-/// \param V2
-/// A vector of [8 x float] values, treated as two [4 x float] vectors.
-/// \param M
-/// An immediate integer argument. Bits [7:4] determine which elements of
-/// the input vectors are used, with bit [4] corresponding to the lowest
-/// element and bit [7] corresponding to the highest element of each [4 x
-/// float] subvector. If a bit is set, the corresponding elements from the
-/// two input vectors are used as an input for dot product; otherwise that
-/// input is treated as zero. Bits [3:0] determine which elements of the
-/// result will receive a copy of the final dot product, with bit [0]
-/// corresponding to the lowest element and bit [3] corresponding to the
-/// highest element of each [4 x float] subvector. If a bit is set, the dot
-/// product is returned in the corresponding element; otherwise that element
-/// is set to zero. The bitmask is applied in the same way to each of the
-/// two parallel dot product computations.
-/// \returns A 256-bit vector of [8 x float] containing the two dot products.
-#define _mm256_dp_ps(V1, V2, M) \
- ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
- (__v8sf)(__m256)(V2), (M)))
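-
-/* Usage sketch (illustrative only; assumes <immintrin.h> and the standard
-   _mm256_set_ps/_mm256_set1_ps helpers): M = 0xF1 multiplies and sums all
-   four elements of each half and writes the sum to element 0 of that half.
-
-     __m256 a = _mm256_set1_ps(1.0f);
-     __m256 b = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f);
-     __m256 r = _mm256_dp_ps(a, b, 0xF1);
-     // r = {1+2+3+4, 0, 0, 0, 5+6+7+8, 0, 0, 0} = {10,0,0,0, 26,0,0,0}
-*/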
-
-/* Vector shuffle */
-/// Selects 8 float values from the 256-bit operands of [8 x float], as
-/// specified by the immediate value operand.
-///
-/// The four selected elements in each operand are copied to the destination
-/// according to the bits specified in the immediate operand. The selected
-/// elements from the first 256-bit operand are copied to bits [63:0] and
-/// bits [191:128] of the destination, and the selected elements from the
-/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
-/// the destination. For example, if bits [7:0] of the immediate operand
-/// contain a value of 0xFF, the 256-bit destination vector would contain the
-/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
-///
-/// \param a
-/// A 256-bit vector of [8 x float]. The four selected elements in this
-/// operand are copied to bits [63:0] and bits [191:128] in the destination,
-/// according to the bits specified in the immediate operand.
-/// \param b
-/// A 256-bit vector of [8 x float]. The four selected elements in this
-/// operand are copied to bits [127:64] and bits [255:192] in the
-/// destination, according to the bits specified in the immediate operand.
-/// \param mask
-/// An immediate value containing an 8-bit value specifying which elements to
-/// copy from \a a and \a b. \n
-/// Bits [3:0] specify the values copied from operand \a a. \n
-/// Bits [7:4] specify the values copied from operand \a b. \n
-/// The destinations within the 256-bit destination are assigned values as
-/// follows, according to the bit value assignments described below: \n
-/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
-/// destination. \n
-/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
-/// destination. \n
-/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
-/// destination. \n
-/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
-/// the destination. \n
-/// Bit value assignments: \n
-/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
-/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
-/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
-/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
-/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
-#define _mm256_shuffle_ps(a, b, mask) \
- ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
- (__v8sf)(__m256)(b), (int)(mask)))
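-
-/* Usage sketch (illustrative only; assumes <immintrin.h> and the standard
-   _mm256_set_ps helper): the 0xFF case worked through in the description
-   above, written out as code.
-
-     __m256 a = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f);
-     __m256 b = _mm256_set_ps(80.0f, 70.0f, 60.0f, 50.0f, 40.0f, 30.0f, 20.0f, 10.0f);
-     __m256 r = _mm256_shuffle_ps(a, b, 0xFF);
-     // r = {a[3],a[3],b[3],b[3], a[7],a[7],b[7],b[7]} = {4,4,40,40, 8,8,80,80} low->high
-*/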
-
-/// Selects four double-precision values from the 256-bit operands of
-/// [4 x double], as specified by the immediate value operand.
-///
-/// The selected elements from the first 256-bit operand are copied to bits
-/// [63:0] and bits [191:128] in the destination, and the selected elements
-/// from the second 256-bit operand are copied to bits [127:64] and bits
-/// [255:192] in the destination. For example, if bits [3:0] of the immediate
-/// operand contain a value of 0xF, the 256-bit destination vector would
-/// contain the following values: b[3], a[3], b[1], a[1].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
-///
-/// \param a
-/// A 256-bit vector of [4 x double].
-/// \param b
-/// A 256-bit vector of [4 x double].
-/// \param mask
-/// An immediate value containing 8-bit values specifying which elements to
-/// copy from \a a and \a b: \n
-/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
-/// destination. \n
-/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
-/// destination. \n
-/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
-/// destination. \n
-/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
-/// destination. \n
-/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
-/// destination. \n
-/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
-/// destination. \n
-/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
-/// destination. \n
-/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
-/// destination.
-/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
-#define _mm256_shuffle_pd(a, b, mask) \
- ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
- (__v4df)(__m256d)(b), (int)(mask)))
-
-/* Compare */
-#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
-#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
-#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
-#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
-#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
-#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
-#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
-#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
-#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
-#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
-#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
-#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
-#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
-#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
-#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
-#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
-#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
-#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
-#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
-#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
-#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
-#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
-#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
-#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
-#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
-#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
-#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
-#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
-#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
-#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
-#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
-#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
-
-/// Compares each of the corresponding double-precision values of two
-/// 128-bit vectors of [2 x double], using the operation specified by the
-/// immediate integer operand.
-///
-/// Returns a [2 x double] vector consisting of two doubles corresponding to
-/// the two comparison results: zero if the comparison is false, and all 1's
-/// if the comparison is true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
-///
-/// \param a
-/// A 128-bit vector of [2 x double].
-/// \param b
-/// A 128-bit vector of [2 x double].
-/// \param c
-/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use: \n
-/// 0x00: Equal (ordered, non-signaling) \n
-/// 0x01: Less-than (ordered, signaling) \n
-/// 0x02: Less-than-or-equal (ordered, signaling) \n
-/// 0x03: Unordered (non-signaling) \n
-/// 0x04: Not-equal (unordered, non-signaling) \n
-/// 0x05: Not-less-than (unordered, signaling) \n
-/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
-/// 0x07: Ordered (non-signaling) \n
-/// 0x08: Equal (unordered, non-signaling) \n
-/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
-/// 0x0A: Not-greater-than (unordered, signaling) \n
-/// 0x0B: False (ordered, non-signaling) \n
-/// 0x0C: Not-equal (ordered, non-signaling) \n
-/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
-/// 0x0E: Greater-than (ordered, signaling) \n
-/// 0x0F: True (unordered, non-signaling) \n
-/// 0x10: Equal (ordered, signaling) \n
-/// 0x11: Less-than (ordered, non-signaling) \n
-/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
-/// 0x13: Unordered (signaling) \n
-/// 0x14: Not-equal (unordered, signaling) \n
-/// 0x15: Not-less-than (unordered, non-signaling) \n
-/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-/// 0x17: Ordered (signaling) \n
-/// 0x18: Equal (unordered, signaling) \n
-/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-/// 0x1A: Not-greater-than (unordered, non-signaling) \n
-/// 0x1B: False (ordered, signaling) \n
-/// 0x1C: Not-equal (ordered, signaling) \n
-/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-/// 0x1E: Greater-than (ordered, non-signaling) \n
-/// 0x1F: True (unordered, signaling)
-/// \returns A 128-bit vector of [2 x double] containing the comparison results.
-#define _mm_cmp_pd(a, b, c) \
- ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
- (__v2df)(__m128d)(b), (c)))
-
-/// Compares each of the corresponding values of two 128-bit vectors of
-/// [4 x float], using the operation specified by the immediate integer
-/// operand.
-///
-/// Returns a [4 x float] vector consisting of four floats corresponding to
-/// the four comparison results: zero if the comparison is false, and all 1's
-/// if the comparison is true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
-///
-/// \param a
-/// A 128-bit vector of [4 x float].
-/// \param b
-/// A 128-bit vector of [4 x float].
-/// \param c
-/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use: \n
-/// 0x00: Equal (ordered, non-signaling) \n
-/// 0x01: Less-than (ordered, signaling) \n
-/// 0x02: Less-than-or-equal (ordered, signaling) \n
-/// 0x03: Unordered (non-signaling) \n
-/// 0x04: Not-equal (unordered, non-signaling) \n
-/// 0x05: Not-less-than (unordered, signaling) \n
-/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
-/// 0x07: Ordered (non-signaling) \n
-/// 0x08: Equal (unordered, non-signaling) \n
-/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
-/// 0x0A: Not-greater-than (unordered, signaling) \n
-/// 0x0B: False (ordered, non-signaling) \n
-/// 0x0C: Not-equal (ordered, non-signaling) \n
-/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
-/// 0x0E: Greater-than (ordered, signaling) \n
-/// 0x0F: True (unordered, non-signaling) \n
-/// 0x10: Equal (ordered, signaling) \n
-/// 0x11: Less-than (ordered, non-signaling) \n
-/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
-/// 0x13: Unordered (signaling) \n
-/// 0x14: Not-equal (unordered, signaling) \n
-/// 0x15: Not-less-than (unordered, non-signaling) \n
-/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-/// 0x17: Ordered (signaling) \n
-/// 0x18: Equal (unordered, signaling) \n
-/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-/// 0x1A: Not-greater-than (unordered, non-signaling) \n
-/// 0x1B: False (ordered, signaling) \n
-/// 0x1C: Not-equal (ordered, signaling) \n
-/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-/// 0x1E: Greater-than (ordered, non-signaling) \n
-/// 0x1F: True (unordered, signaling)
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-#define _mm_cmp_ps(a, b, c) \
- ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
- (__v4sf)(__m128)(b), (c)))
-
-/// Compares each of the corresponding double-precision values of two
-/// 256-bit vectors of [4 x double], using the operation specified by the
-/// immediate integer operand.
-///
-/// Returns a [4 x double] vector consisting of four doubles corresponding to
-/// the four comparison results: zero if the comparison is false, and all 1's
-/// if the comparison is true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
-///
-/// \param a
-/// A 256-bit vector of [4 x double].
-/// \param b
-/// A 256-bit vector of [4 x double].
-/// \param c
-/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use: \n
-/// 0x00: Equal (ordered, non-signaling) \n
-/// 0x01: Less-than (ordered, signaling) \n
-/// 0x02: Less-than-or-equal (ordered, signaling) \n
-/// 0x03: Unordered (non-signaling) \n
-/// 0x04: Not-equal (unordered, non-signaling) \n
-/// 0x05: Not-less-than (unordered, signaling) \n
-/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
-/// 0x07: Ordered (non-signaling) \n
-/// 0x08: Equal (unordered, non-signaling) \n
-/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
-/// 0x0A: Not-greater-than (unordered, signaling) \n
-/// 0x0B: False (ordered, non-signaling) \n
-/// 0x0C: Not-equal (ordered, non-signaling) \n
-/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
-/// 0x0E: Greater-than (ordered, signaling) \n
-/// 0x0F: True (unordered, non-signaling) \n
-/// 0x10: Equal (ordered, signaling) \n
-/// 0x11: Less-than (ordered, non-signaling) \n
-/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
-/// 0x13: Unordered (signaling) \n
-/// 0x14: Not-equal (unordered, signaling) \n
-/// 0x15: Not-less-than (unordered, non-signaling) \n
-/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-/// 0x17: Ordered (signaling) \n
-/// 0x18: Equal (unordered, signaling) \n
-/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-/// 0x1A: Not-greater-than (unordered, non-signaling) \n
-/// 0x1B: False (ordered, signaling) \n
-/// 0x1C: Not-equal (ordered, signaling) \n
-/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-/// 0x1E: Greater-than (ordered, non-signaling) \n
-/// 0x1F: True (unordered, signaling)
-/// \returns A 256-bit vector of [4 x double] containing the comparison results.
-#define _mm256_cmp_pd(a, b, c) \
- ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
- (__v4df)(__m256d)(b), (c)))
-
-/// Compares each of the corresponding values of two 256-bit vectors of
-/// [8 x float], using the operation specified by the immediate integer
-/// operand.
-///
-/// Returns an [8 x float] vector consisting of eight floats corresponding to
-/// the eight comparison results: zero if the comparison is false, and all
-/// 1's if the comparison is true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
-///
-/// \param a
-/// A 256-bit vector of [8 x float].
-/// \param b
-/// A 256-bit vector of [8 x float].
-/// \param c
-/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use: \n
-/// 0x00: Equal (ordered, non-signaling) \n
-/// 0x01: Less-than (ordered, signaling) \n
-/// 0x02: Less-than-or-equal (ordered, signaling) \n
-/// 0x03: Unordered (non-signaling) \n
-/// 0x04: Not-equal (unordered, non-signaling) \n
-/// 0x05: Not-less-than (unordered, signaling) \n
-/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
-/// 0x07: Ordered (non-signaling) \n
-/// 0x08: Equal (unordered, non-signaling) \n
-/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
-/// 0x0A: Not-greater-than (unordered, signaling) \n
-/// 0x0B: False (ordered, non-signaling) \n
-/// 0x0C: Not-equal (ordered, non-signaling) \n
-/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
-/// 0x0E: Greater-than (ordered, signaling) \n
-/// 0x0F: True (unordered, non-signaling) \n
-/// 0x10: Equal (ordered, signaling) \n
-/// 0x11: Less-than (ordered, non-signaling) \n
-/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
-/// 0x13: Unordered (signaling) \n
-/// 0x14: Not-equal (unordered, signaling) \n
-/// 0x15: Not-less-than (unordered, non-signaling) \n
-/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-/// 0x17: Ordered (signaling) \n
-/// 0x18: Equal (unordered, signaling) \n
-/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-/// 0x1A: Not-greater-than (unordered, non-signaling) \n
-/// 0x1B: False (ordered, signaling) \n
-/// 0x1C: Not-equal (ordered, signaling) \n
-/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-/// 0x1E: Greater-than (ordered, non-signaling) \n
-/// 0x1F: True (unordered, signaling)
-/// \returns A 256-bit vector of [8 x float] containing the comparison results.
-#define _mm256_cmp_ps(a, b, c) \
- ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
- (__v8sf)(__m256)(b), (c)))
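-
-/* Usage sketch (illustrative only; assumes <immintrin.h>; _mm256_movemask_ps
-   is declared elsewhere in this header family): the named _CMP_* codes defined
-   above are the intended way to spell the third argument.
-
-     __m256 a = _mm256_set_ps(8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f);
-     __m256 t = _mm256_set1_ps(3.0f);
-     __m256 m = _mm256_cmp_ps(a, t, _CMP_GT_OQ);   // all-ones where a > 3.0
-     int bits = _mm256_movemask_ps(m);             // 0b11111000: one bit per element
-*/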
-
-/// Compares each of the corresponding scalar double-precision values of
-/// two 128-bit vectors of [2 x double], using the operation specified by the
-/// immediate integer operand.
-///
-/// If the result is true, the lower 64 bits of the destination vector are
-/// set to all 1's; otherwise they are cleared. The upper 64 bits of the
-/// destination are copied from the upper 64 bits of \a a.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
-///
-/// \param a
-/// A 128-bit vector of [2 x double].
-/// \param b
-/// A 128-bit vector of [2 x double].
-/// \param c
-/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use: \n
-/// 0x00: Equal (ordered, non-signaling) \n
-/// 0x01: Less-than (ordered, signaling) \n
-/// 0x02: Less-than-or-equal (ordered, signaling) \n
-/// 0x03: Unordered (non-signaling) \n
-/// 0x04: Not-equal (unordered, non-signaling) \n
-/// 0x05: Not-less-than (unordered, signaling) \n
-/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
-/// 0x07: Ordered (non-signaling) \n
-/// 0x08: Equal (unordered, non-signaling) \n
-/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
-/// 0x0A: Not-greater-than (unordered, signaling) \n
-/// 0x0B: False (ordered, non-signaling) \n
-/// 0x0C: Not-equal (ordered, non-signaling) \n
-/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
-/// 0x0E: Greater-than (ordered, signaling) \n
-/// 0x0F: True (unordered, non-signaling) \n
-/// 0x10: Equal (ordered, signaling) \n
-/// 0x11: Less-than (ordered, non-signaling) \n
-/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
-/// 0x13: Unordered (signaling) \n
-/// 0x14: Not-equal (unordered, signaling) \n
-/// 0x15: Not-less-than (unordered, non-signaling) \n
-/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-/// 0x17: Ordered (signaling) \n
-/// 0x18: Equal (unordered, signaling) \n
-/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-/// 0x1A: Not-greater-than (unordered, non-signaling) \n
-/// 0x1B: False (ordered, signaling) \n
-/// 0x1C: Not-equal (ordered, signaling) \n
-/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-/// 0x1E: Greater-than (ordered, non-signaling) \n
-/// 0x1F: True (unordered, signaling)
-/// \returns A 128-bit vector of [2 x double] containing the comparison results.
-#define _mm_cmp_sd(a, b, c) \
- ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
- (__v2df)(__m128d)(b), (c)))
-
-/// Compares each of the corresponding scalar values of two 128-bit
-/// vectors of [4 x float], using the operation specified by the immediate
-/// integer operand.
-///
-/// If the result is true, the lower 32 bits of the destination vector are
-/// set to all 1's; otherwise they are cleared. Bits [127:32] of the
-/// destination are copied from bits [127:32] of \a a.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
-///
-/// \param a
-/// A 128-bit vector of [4 x float].
-/// \param b
-/// A 128-bit vector of [4 x float].
-/// \param c
-/// An immediate integer operand, with bits [4:0] specifying which comparison
-/// operation to use: \n
-/// 0x00: Equal (ordered, non-signaling) \n
-/// 0x01: Less-than (ordered, signaling) \n
-/// 0x02: Less-than-or-equal (ordered, signaling) \n
-/// 0x03: Unordered (non-signaling) \n
-/// 0x04: Not-equal (unordered, non-signaling) \n
-/// 0x05: Not-less-than (unordered, signaling) \n
-/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
-/// 0x07: Ordered (non-signaling) \n
-/// 0x08: Equal (unordered, non-signaling) \n
-/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
-/// 0x0A: Not-greater-than (unordered, signaling) \n
-/// 0x0B: False (ordered, non-signaling) \n
-/// 0x0C: Not-equal (ordered, non-signaling) \n
-/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
-/// 0x0E: Greater-than (ordered, signaling) \n
-/// 0x0F: True (unordered, non-signaling) \n
-/// 0x10: Equal (ordered, signaling) \n
-/// 0x11: Less-than (ordered, non-signaling) \n
-/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
-/// 0x13: Unordered (signaling) \n
-/// 0x14: Not-equal (unordered, signaling) \n
-/// 0x15: Not-less-than (unordered, non-signaling) \n
-/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
-/// 0x17: Ordered (signaling) \n
-/// 0x18: Equal (unordered, signaling) \n
-/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
-/// 0x1A: Not-greater-than (unordered, non-signaling) \n
-/// 0x1B: False (ordered, signaling) \n
-/// 0x1C: Not-equal (ordered, signaling) \n
-/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
-/// 0x1E: Greater-than (ordered, non-signaling) \n
-/// 0x1F: True (unordered, signaling)
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-#define _mm_cmp_ss(a, b, c) \
- ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
- (__v4sf)(__m128)(b), (c)))
-
-/// Takes an [8 x i32] vector and returns the vector element value
-/// indexed by the immediate constant operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
-/// instruction.
-///
-/// \param X
-/// A 256-bit vector of [8 x i32].
-/// \param N
-/// An immediate integer operand with bits [2:0] determining which vector
-/// element is extracted and returned.
-/// \returns A 32-bit integer containing the extracted 32 bits of extended
-/// packed data.
-#define _mm256_extract_epi32(X, N) \
- ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
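-
-/* Usage sketch (illustrative only; assumes <immintrin.h> and the standard
-   _mm256_set_epi32 helper): the index must be a compile-time constant.
-
-     __m256i v  = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);  // element i holds i
-     int     e5 = _mm256_extract_epi32(v, 5);                // e5 == 5
-*/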
-
-/// Takes a [16 x i16] vector and returns the vector element value
-/// indexed by the immediate constant operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
-/// instruction.
-///
-/// \param X
-/// A 256-bit integer vector of [16 x i16].
-/// \param N
-/// An immediate integer operand with bits [3:0] determining which vector
-/// element is extracted and returned.
-/// \returns A 32-bit integer containing the extracted 16 bits of zero extended
-/// packed data.
-#define _mm256_extract_epi16(X, N) \
- ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
- (int)(N)))
-
-/// Takes a [32 x i8] vector and returns the vector element value
-/// indexed by the immediate constant operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
-/// instruction.
-///
-/// \param X
-/// A 256-bit integer vector of [32 x i8].
-/// \param N
-/// An immediate integer operand with bits [4:0] determining which vector
-/// element is extracted and returned.
-/// \returns A 32-bit integer containing the extracted 8 bits of zero extended
-/// packed data.
-#define _mm256_extract_epi8(X, N) \
- ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
- (int)(N)))
-
-#ifdef __x86_64__
-/// Takes a [4 x i64] vector and returns the vector element value
-/// indexed by the immediate constant operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
-/// instruction.
-///
-/// \param X
-/// A 256-bit integer vector of [4 x i64].
-/// \param N
-/// An immediate integer operand with bits [1:0] determining which vector
-/// element is extracted and returned.
-/// \returns A 64-bit integer containing the extracted 64 bits of extended
-/// packed data.
-#define _mm256_extract_epi64(X, N) \
- ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
-#endif
-
-/// Takes an [8 x i32] vector and replaces the vector element value
-/// indexed by the immediate constant operand with a new value. Returns the
-/// modified vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
-/// instruction.
-///
-/// \param X
-/// A vector of [8 x i32] to be used by the insert operation.
-/// \param I
-/// An integer value. The replacement value for the insert operation.
-/// \param N
-/// An immediate integer specifying the index of the vector element to be
-/// replaced.
-/// \returns A copy of vector \a X, after replacing its element indexed by
-/// \a N with \a I.
-#define _mm256_insert_epi32(X, I, N) \
- ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
- (int)(I), (int)(N)))
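-
-/* Usage sketch (illustrative only; assumes <immintrin.h> and the standard
-   _mm256_setzero_si256 helper): the source vector is not modified; the
-   replacement happens in the returned copy.
-
-     __m256i v = _mm256_setzero_si256();
-     __m256i r = _mm256_insert_epi32(v, 42, 3);   // element 3 of r is 42, the rest stay 0
-*/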
-
-/// Takes a [16 x i16] vector and replaces the vector element value
-/// indexed by the immediate constant operand with a new value. Returns the
-/// modified vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
-/// instruction.
-///
-/// \param __a
-/// A vector of [16 x i16] to be used by the insert operation.
-/// \param __b
-/// An i16 integer value. The replacement value for the insert operation.
-/// \param __imm
-/// An immediate integer specifying the index of the vector element to be
-/// replaced.
-/// \returns A copy of vector \a __a, after replacing its element indexed by
-/// \a __imm with \a __b.
-#define _mm256_insert_epi16(X, I, N) \
- ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
- (int)(I), (int)(N)))
-
-/// Takes a [32 x i8] vector and replaces the vector element value
-/// indexed by the immediate constant operand with a new value. Returns the
-/// modified vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
-/// instruction.
-///
-/// \param __a
-/// A vector of [32 x i8] to be used by the insert operation.
-/// \param __b
-/// An i8 integer value. The replacement value for the insert operation.
-/// \param __imm
-/// An immediate integer specifying the index of the vector element to be
-/// replaced.
-/// \returns A copy of vector \a __a, after replacing its element indexed by
-/// \a __imm with \a __b.
-#define _mm256_insert_epi8(X, I, N) \
- ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
- (int)(I), (int)(N)))
-
-#ifdef __x86_64__
-/// Takes a [4 x i64] vector and replaces the vector element value
-/// indexed by the immediate constant operand with a new value. Returns the
-/// modified vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
-/// instruction.
-///
-/// \param __a
-/// A vector of [4 x i64] to be used by the insert operation.
-/// \param __b
-/// A 64-bit integer value. The replacement value for the insert operation.
-/// \param __imm
-/// An immediate integer specifying the index of the vector element to be
-/// replaced.
-/// \returns A copy of vector \a __a, after replacing its element indexed by
-/// \a __imm with \a __b.
-#define _mm256_insert_epi64(X, I, N) \
- ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
- (long long)(I), (int)(N)))
-#endif
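-
-/* Illustrative usage sketch (hypothetical helper, assumes <immintrin.h> and
- * -mavx): the insert macros return a modified copy; the source vector is
- * left unchanged. */
-static inline __m256i example_insert_lane3(void)
-{
-  __m256i v = _mm256_setzero_si256();   /* all eight 32-bit lanes are 0 */
-  return _mm256_insert_epi32(v, 42, 3); /* lane 3 becomes 42, others stay 0 */
-}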
-
-/* Conversion */
-/// Converts a vector of [4 x i32] into a vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector of [4 x i32].
-/// \returns A 256-bit vector of [4 x double] containing the converted values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_cvtepi32_pd(__m128i __a)
-{
- return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
-}
-
-/// Converts a vector of [8 x i32] into a vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
-///
-/// \param __a
-/// A 256-bit integer vector.
-/// \returns A 256-bit vector of [8 x float] containing the converted values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_cvtepi32_ps(__m256i __a)
-{
- return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
-}
-
-/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
-/// [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double].
-/// \returns A 128-bit vector of [4 x float] containing the converted values.
-static __inline __m128 __DEFAULT_FN_ATTRS
-_mm256_cvtpd_ps(__m256d __a)
-{
- return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
-}
-
-/// Converts a vector of [8 x float] into a vector of [8 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float].
-/// \returns A 256-bit integer vector containing the converted values.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_cvtps_epi32(__m256 __a)
-{
- return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
-}
-
-/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
-/// x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 256-bit vector of [4 x double] containing the converted values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_cvtps_pd(__m128 __a)
-{
- return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
-}
-
-/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
-/// x i32], truncating the result by rounding towards zero when it is
-/// inexact.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double].
-/// \returns A 128-bit integer vector containing the converted values.
-static __inline __m128i __DEFAULT_FN_ATTRS
-_mm256_cvttpd_epi32(__m256d __a)
-{
- return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
-}
-
-/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
-/// x i32]. When a conversion is inexact, the value returned is rounded
-/// according to the rounding control bits in the MXCSR register.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double].
-/// \returns A 128-bit integer vector containing the converted values.
-static __inline __m128i __DEFAULT_FN_ATTRS
-_mm256_cvtpd_epi32(__m256d __a)
-{
- return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
-}
-
-/// Converts a vector of [8 x float] into a vector of [8 x i32],
-/// truncating the result by rounding towards zero when it is inexact.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float].
-/// \returns A 256-bit integer vector containing the converted values.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_cvttps_epi32(__m256 __a)
-{
- return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
-}
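-
-/* Illustrative usage sketch (hypothetical helper, assumes <immintrin.h> and
- * -mavx): _mm256_cvtpd_epi32 rounds according to MXCSR (round-to-nearest-even
- * by default), while _mm256_cvttpd_epi32 always truncates toward zero. */
-static inline void example_round_vs_truncate(__m128i *rounded, __m128i *truncated)
-{
-  __m256d d = _mm256_set_pd(2.7, 2.7, 2.7, 2.7);
-  *rounded   = _mm256_cvtpd_epi32(d);  /* each 32-bit lane becomes 3 */
-  *truncated = _mm256_cvttpd_epi32(d); /* each 32-bit lane becomes 2 */
-}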
-
-/// Returns the first element of the input vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double].
-/// \returns A 64-bit double containing the first element of the input vector.
-static __inline double __DEFAULT_FN_ATTRS
-_mm256_cvtsd_f64(__m256d __a)
-{
- return __a[0];
-}
-
-/// Returns the first element of the input vector of [8 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x i32].
-/// \returns A 32-bit integer containing the first element of the input vector.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_cvtsi256_si32(__m256i __a)
-{
- __v8si __b = (__v8si)__a;
- return __b[0];
-}
-
-/// Returns the first element of the input vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float].
-/// \returns A 32-bit float containing the first element of the input vector.
-static __inline float __DEFAULT_FN_ATTRS
-_mm256_cvtss_f32(__m256 __a)
-{
- return __a[0];
-}
-
-/* Vector replicate */
-/// Moves and duplicates odd-indexed values from a 256-bit vector of
-/// [8 x float] to float values in a 256-bit vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float]. \n
-/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
-/// the return value. \n
-/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
-/// the return value. \n
-/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
-/// return value. \n
-/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
-/// return value.
-/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
-/// values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_movehdup_ps(__m256 __a)
-{
- return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
-}
-
-/// Moves and duplicates even-indexed values from a 256-bit vector of
-/// [8 x float] to float values in a 256-bit vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float]. \n
-/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
-/// the return value. \n
-/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
-/// the return value. \n
-/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
-/// return value. \n
-/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
-/// return value.
-/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
-/// values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_moveldup_ps(__m256 __a)
-{
- return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
-}
-
-/// Moves and duplicates double-precision floating point values from a
-/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
-/// vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double]. \n
-/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
-/// return value. \n
-/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
-/// the return value.
-/// \returns A 256-bit vector of [4 x double] containing the moved and
-/// duplicated values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_movedup_pd(__m256d __a)
-{
- return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
-}
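-
-/* Illustrative usage sketch (hypothetical helper, assumes <immintrin.h> and
- * -mavx): duplicating the odd-indexed lanes of an [8 x float] vector. */
-static inline __m256 example_dup_odd_lanes(void)
-{
-  __m256 v = _mm256_set_ps(7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f);
-  return _mm256_movehdup_ps(v); /* lanes become 1,1,3,3,5,5,7,7 */
-}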
-
-/* Unpack and Interleave */
-/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
-/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit floating-point vector of [4 x double]. \n
-/// Bits [127:64] are written to bits [63:0] of the return value. \n
-/// Bits [255:192] are written to bits [191:128] of the return value. \n
-/// \param __b
-/// A 256-bit floating-point vector of [4 x double]. \n
-/// Bits [127:64] are written to bits [127:64] of the return value. \n
-/// Bits [255:192] are written to bits [255:192] of the return value. \n
-/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_unpackhi_pd(__m256d __a, __m256d __b)
-{
- return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
-}
-
-/// Unpacks the even-indexed vector elements from two 256-bit vectors of
-/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit floating-point vector of [4 x double]. \n
-/// Bits [63:0] are written to bits [63:0] of the return value. \n
-/// Bits [191:128] are written to bits [191:128] of the return value.
-/// \param __b
-/// A 256-bit floating-point vector of [4 x double]. \n
-/// Bits [63:0] are written to bits [127:64] of the return value. \n
-/// Bits [191:128] are written to bits [255:192] of the return value. \n
-/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_unpacklo_pd(__m256d __a, __m256d __b)
-{
- return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
-}
-
-/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
-/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
-/// vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float]. \n
-/// Bits [95:64] are written to bits [31:0] of the return value. \n
-/// Bits [127:96] are written to bits [95:64] of the return value. \n
-/// Bits [223:192] are written to bits [159:128] of the return value. \n
-/// Bits [255:224] are written to bits [223:192] of the return value.
-/// \param __b
-/// A 256-bit vector of [8 x float]. \n
-/// Bits [95:64] are written to bits [63:32] of the return value. \n
-/// Bits [127:96] are written to bits [127:96] of the return value. \n
-/// Bits [223:192] are written to bits [191:160] of the return value. \n
-/// Bits [255:224] are written to bits [255:224] of the return value.
-/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_unpackhi_ps(__m256 __a, __m256 __b)
-{
- return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
-}
-
-/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
-/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
-/// vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float]. \n
-/// Bits [31:0] are written to bits [31:0] of the return value. \n
-/// Bits [63:32] are written to bits [95:64] of the return value. \n
-/// Bits [159:128] are written to bits [159:128] of the return value. \n
-/// Bits [191:160] are written to bits [223:192] of the return value.
-/// \param __b
-/// A 256-bit vector of [8 x float]. \n
-/// Bits [31:0] are written to bits [63:32] of the return value. \n
-/// Bits [63:32] are written to bits [127:96] of the return value. \n
-/// Bits [159:128] are written to bits [191:160] of the return value. \n
-/// Bits [191:160] are written to bits [255:224] of the return value.
-/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_unpacklo_ps(__m256 __a, __m256 __b)
-{
- return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
-}
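-
-/* Illustrative usage sketch (hypothetical helper, assumes <immintrin.h> and
- * -mavx): the unpack operations interleave within each 128-bit half, not
- * across the full 256-bit vector. */
-static inline void example_unpack_pd(__m256d *lo, __m256d *hi)
-{
-  __m256d a = _mm256_set_pd(3.0, 2.0, 1.0, 0.0); /* lanes 0..3 hold 0,1,2,3 */
-  __m256d b = _mm256_set_pd(7.0, 6.0, 5.0, 4.0); /* lanes 0..3 hold 4,5,6,7 */
-  *lo = _mm256_unpacklo_pd(a, b); /* lanes hold 0,4,2,6 */
-  *hi = _mm256_unpackhi_pd(a, b); /* lanes hold 1,5,3,7 */
-}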
-
-/* Bit Test */
-/// Given two 128-bit floating-point vectors of [2 x double], perform an
-/// element-by-element comparison of the double-precision element in the
-/// first source vector and the corresponding element in the second source
-/// vector.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of double-precision elements where the
-/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-/// ZF flag is set to 1. \n
-/// If there is at least one pair of double-precision elements where the
-/// sign-bit of the first element is 0 and the sign-bit of the second element
-/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns the value of the ZF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns the ZF flag in the EFLAGS register.
-static __inline int __DEFAULT_FN_ATTRS128
-_mm_testz_pd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Given two 128-bit floating-point vectors of [2 x double], perform an
-/// element-by-element comparison of the double-precision element in the
-/// first source vector and the corresponding element in the second source
-/// vector.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of double-precision elements where the
-/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-/// ZF flag is set to 1. \n
-/// If there is at least one pair of double-precision elements where the
-/// sign-bit of the first element is 0 and the sign-bit of the second element
-/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns the value of the CF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns the CF flag in the EFLAGS register.
-static __inline int __DEFAULT_FN_ATTRS128
-_mm_testc_pd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Given two 128-bit floating-point vectors of [2 x double], perform an
-/// element-by-element comparison of the double-precision element in the
-/// first source vector and the corresponding element in the second source
-/// vector.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of double-precision elements where the
-/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-/// ZF flag is set to 1. \n
-/// If there is at least one pair of double-precision elements where the
-/// sign-bit of the first element is 0 and the sign-bit of the second element
-/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
-/// otherwise it returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
-static __inline int __DEFAULT_FN_ATTRS128
-_mm_testnzc_pd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Given two 128-bit floating-point vectors of [4 x float], perform an
-/// element-by-element comparison of the single-precision element in the
-/// first source vector and the corresponding element in the second source
-/// vector.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of single-precision elements where the
-/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-/// ZF flag is set to 1. \n
-/// If there is at least one pair of single-precision elements where the
-/// sign-bit of the first element is 0 and the sign-bit of the second element
-/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns the value of the ZF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns the ZF flag.
-static __inline int __DEFAULT_FN_ATTRS128
-_mm_testz_ps(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Given two 128-bit floating-point vectors of [4 x float], perform an
-/// element-by-element comparison of the single-precision element in the
-/// first source vector and the corresponding element in the second source
-/// vector.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of single-precision elements where the
-/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-/// ZF flag is set to 1. \n
-/// If there is at least one pair of single-precision elements where the
-/// sign-bit of the first element is 0 and the sign-bit of the second element
-/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns the value of the CF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns the CF flag.
-static __inline int __DEFAULT_FN_ATTRS128
-_mm_testc_ps(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Given two 128-bit floating-point vectors of [4 x float], perform an
-/// element-by-element comparison of the single-precision element in the
-/// first source vector and the corresponding element in the second source
-/// vector.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of single-precision elements where the
-/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-/// ZF flag is set to 1. \n
-/// If there is at least one pair of single-precision elements where the
-/// sign-bit of the first element is 0 and the sign-bit of the second element
-/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
-/// otherwise it returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
-static __inline int __DEFAULT_FN_ATTRS128
-_mm_testnzc_ps(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Given two 256-bit floating-point vectors of [4 x double], perform an
-/// element-by-element comparison of the double-precision elements in the
-/// first source vector and the corresponding elements in the second source
-/// vector.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of double-precision elements where the
-/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-/// ZF flag is set to 1. \n
-/// If there is at least one pair of double-precision elements where the
-/// sign-bit of the first element is 0 and the sign-bit of the second element
-/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns the value of the ZF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double].
-/// \param __b
-/// A 256-bit vector of [4 x double].
-/// \returns the ZF flag.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testz_pd(__m256d __a, __m256d __b)
-{
- return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Given two 256-bit floating-point vectors of [4 x double], perform an
-/// element-by-element comparison of the double-precision elements in the
-/// first source vector and the corresponding elements in the second source
-/// vector.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of double-precision elements where the
-/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-/// ZF flag is set to 1. \n
-/// If there is at least one pair of double-precision elements where the
-/// sign-bit of the first element is 0 and the sign-bit of the second element
-/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns the value of the CF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double].
-/// \param __b
-/// A 256-bit vector of [4 x double].
-/// \returns the CF flag.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testc_pd(__m256d __a, __m256d __b)
-{
- return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Given two 256-bit floating-point vectors of [4 x double], perform an
-/// element-by-element comparison of the double-precision elements in the
-/// first source vector and the corresponding elements in the second source
-/// vector.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of double-precision elements where the
-/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-/// ZF flag is set to 1. \n
-/// If there is at least one pair of double-precision elements where the
-/// sign-bit of the first element is 0 and the sign-bit of the second element
-/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
-/// otherwise it returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double].
-/// \param __b
-/// A 256-bit vector of [4 x double].
-/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testnzc_pd(__m256d __a, __m256d __b)
-{
- return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
-}
-
-/// Given two 256-bit floating-point vectors of [8 x float], perform an
-/// element-by-element comparison of the single-precision element in the
-/// first source vector and the corresponding element in the second source
-/// vector.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of single-precision elements where the
-/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-/// ZF flag is set to 1. \n
-/// If there is at least one pair of single-precision elements where the
-/// sign-bit of the first element is 0 and the sign-bit of the second element
-/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns the value of the ZF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float].
-/// \param __b
-/// A 256-bit vector of [8 x float].
-/// \returns the ZF flag.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testz_ps(__m256 __a, __m256 __b)
-{
- return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Given two 256-bit floating-point vectors of [8 x float], perform an
-/// element-by-element comparison of the single-precision element in the
-/// first source vector and the corresponding element in the second source
-/// vector.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of single-precision elements where the
-/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-/// ZF flag is set to 1. \n
-/// If there is at least one pair of single-precision elements where the
-/// sign-bit of the first element is 0 and the sign-bit of the second element
-/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns the value of the CF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float].
-/// \param __b
-/// A 256-bit vector of [8 x float].
-/// \returns the CF flag.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testc_ps(__m256 __a, __m256 __b)
-{
- return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Given two 256-bit floating-point vectors of [8 x float], perform an
-/// element-by-element comparison of the single-precision elements in the
-/// first source vector and the corresponding elements in the second source
-/// vector.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of single-precision elements where the
-/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
-/// ZF flag is set to 1. \n
-/// If there is at least one pair of single-precision elements where the
-/// sign-bit of the first element is 0 and the sign-bit of the second element
-/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
-/// otherwise it returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float].
-/// \param __b
-/// A 256-bit vector of [8 x float].
-/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testnzc_ps(__m256 __a, __m256 __b)
-{
- return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
-}
-
-/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
-/// of the two source vectors.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of bits where both bits are 1, the ZF flag
-/// is set to 0. Otherwise the ZF flag is set to 1. \n
-/// If there is at least one pair of bits where the bit from the first source
-/// vector is 0 and the bit from the second source vector is 1, the CF flag
-/// is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns the value of the ZF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
-///
-/// \param __a
-/// A 256-bit integer vector.
-/// \param __b
-/// A 256-bit integer vector.
-/// \returns the ZF flag.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testz_si256(__m256i __a, __m256i __b)
-{
- return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
-}
-
-/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
-/// of the two source vectors.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of bits where both bits are 1, the ZF flag
-/// is set to 0. Otherwise the ZF flag is set to 1. \n
-/// If there is at least one pair of bits where the bit from the first source
-/// vector is 0 and the bit from the second source vector is 1, the CF flag
-/// is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns the value of the CF flag.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
-///
-/// \param __a
-/// A 256-bit integer vector.
-/// \param __b
-/// A 256-bit integer vector.
-/// \returns the CF flag.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testc_si256(__m256i __a, __m256i __b)
-{
- return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
-}
-
-/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
-/// of the two source vectors.
-///
-/// The EFLAGS register is updated as follows: \n
-/// If there is at least one pair of bits where both bits are 1, the ZF flag
-/// is set to 0. Otherwise the ZF flag is set to 1. \n
-/// If there is at least one pair of bits where the bit from the first source
-/// vector is 0 and the bit from the second source vector is 1, the CF flag
-/// is set to 0. Otherwise the CF flag is set to 1. \n
-/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
-/// otherwise it returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
-///
-/// \param __a
-/// A 256-bit integer vector.
-/// \param __b
-/// A 256-bit integer vector.
-/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_testnzc_si256(__m256i __a, __m256i __b)
-{
- return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
-}
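-
-/* Illustrative usage sketch (hypothetical helper, assumes <immintrin.h> and
- * -mavx): testing a vector against itself with VPTEST gives a cheap
- * "is every bit zero?" check. */
-static inline int example_is_all_zero(__m256i v)
-{
-  return _mm256_testz_si256(v, v); /* 1 if (v & v) == 0, i.e. v is all zeros */
-}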
-
-/* Vector extract sign mask */
-/// Extracts the sign bits of double-precision floating point elements
-/// in a 256-bit vector of [4 x double] and writes them to the lower order
-/// bits of the return value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [4 x double] containing the double-precision
-/// floating point values with sign bits to be extracted.
-/// \returns The sign bits from the operand, written to bits [3:0].
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_movemask_pd(__m256d __a)
-{
- return __builtin_ia32_movmskpd256((__v4df)__a);
-}
-
-/// Extracts the sign bits of single-precision floating point elements
-/// in a 256-bit vector of [8 x float] and writes them to the lower order
-/// bits of the return value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
-///
-/// \param __a
-/// A 256-bit vector of [8 x float] containing the single-precision floating
-/// point values with sign bits to be extracted.
-/// \returns The sign bits from the operand, written to bits [7:0].
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_movemask_ps(__m256 __a)
-{
- return __builtin_ia32_movmskps256((__v8sf)__a);
-}
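-
-/* Illustrative usage sketch (hypothetical helper, assumes <immintrin.h> and
- * -mavx): collecting the sign bits of a [4 x double] vector into an integer
- * bitmask. */
-static inline int example_sign_mask(void)
-{
-  __m256d v = _mm256_set_pd(-4.0, 3.0, -2.0, 1.0); /* lanes 0..3 = 1,-2,3,-4 */
-  return _mm256_movemask_pd(v);                    /* 0xA: bits 1 and 3 set */
-}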
-
-/* Vector __zero */
-/// Zeroes the contents of all XMM or YMM registers.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
-static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
-_mm256_zeroall(void)
-{
- __builtin_ia32_vzeroall();
-}
-
-/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
-static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
-_mm256_zeroupper(void)
-{
- __builtin_ia32_vzeroupper();
-}
-
-/* Vector load with broadcast */
-/// Loads a scalar single-precision floating point value from the
-/// specified address pointed to by \a __a and broadcasts it to the elements
-/// of a [4 x float] vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
-///
-/// \param __a
-/// A pointer to the single-precision floating point value to be broadcast.
-/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
-/// equal to the broadcast value.
-static __inline __m128 __DEFAULT_FN_ATTRS128
-_mm_broadcast_ss(float const *__a)
-{
- float __f = *__a;
- return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
-}
-
-/// Loads a scalar double-precision floating point value from the
-/// specified address pointed to by \a __a and broadcasts it to the elements
-/// of a [4 x double] vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
-///
-/// \param __a
-/// A pointer to the double-precision floating point value to be broadcast.
-/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
-/// equal to the broadcast value.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_broadcast_sd(double const *__a)
-{
- double __d = *__a;
- return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
-}
-
-/// Loads a scalar single-precision floating point value from the
-/// specified address pointed to by \a __a and broadcasts it to the elements
-/// of a [8 x float] vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
-///
-/// \param __a
-/// A pointer to the single-precision floating point value to be broadcast.
-/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
-/// equal to the broadcast value.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_broadcast_ss(float const *__a)
-{
- float __f = *__a;
- return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
-}
-
-/// Loads a 128-bit vector of [2 x double] from the address pointed to by
-/// \a __a and broadcasts it to both 128-bit halves of a 256-bit vector of
-/// [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
-///
-/// \param __a
-/// A pointer to the 128-bit vector of [2 x double] to be broadcast.
-/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
-/// equal to the broadcast value.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_broadcast_pd(__m128d const *__a)
-{
- __m128d __b = _mm_loadu_pd((const double *)__a);
- return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
- 0, 1, 0, 1);
-}
-
-/// Loads a 128-bit vector of [4 x float] from the address pointed to by
-/// \a __a and broadcasts it to both 128-bit halves of a 256-bit vector of
-/// [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
-///
-/// \param __a
-/// A pointer to the 128-bit vector of [4 x float] to be broadcast.
-/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
-/// equal to the broadcast value.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_broadcast_ps(__m128 const *__a)
-{
- __m128 __b = _mm_loadu_ps((const float *)__a);
- return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
- 0, 1, 2, 3, 0, 1, 2, 3);
-}
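-
-/* Illustrative usage sketch (hypothetical helper, assumes <immintrin.h> and
- * -mavx): broadcasting one double from memory into every lane of a
- * [4 x double] vector. */
-static inline __m256d example_splat_double(double const *p)
-{
-  return _mm256_broadcast_sd(p); /* all four lanes receive *p */
-}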
-
-/* SIMD load ops */
-/// Loads 4 double-precision floating point values from a 32-byte aligned
-/// memory location pointed to by \a __p into a vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
-///
-/// \param __p
-/// A 32-byte aligned pointer to a memory location containing
-/// double-precision floating point values.
-/// \returns A 256-bit vector of [4 x double] containing the moved values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_load_pd(double const *__p)
-{
- return *(const __m256d *)__p;
-}
-
-/// Loads 8 single-precision floating point values from a 32-byte aligned
-/// memory location pointed to by \a __p into a vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
-///
-/// \param __p
-/// A 32-byte aligned pointer to a memory location containing float values.
-/// \returns A 256-bit vector of [8 x float] containing the moved values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_load_ps(float const *__p)
-{
- return *(const __m256 *)__p;
-}
-
-/// Loads 4 double-precision floating point values from an unaligned
-/// memory location pointed to by \a __p into a vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location containing double-precision floating
-/// point values.
-/// \returns A 256-bit vector of [4 x double] containing the moved values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_loadu_pd(double const *__p)
-{
- struct __loadu_pd {
- __m256d_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_pd*)__p)->__v;
-}
-
-/// Loads 8 single-precision floating point values from an unaligned
-/// memory location pointed to by \a __p into a vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location containing single-precision floating
-/// point values.
-/// \returns A 256-bit vector of [8 x float] containing the moved values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_loadu_ps(float const *__p)
-{
- struct __loadu_ps {
- __m256_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_ps*)__p)->__v;
-}
-
-/// Loads 256 bits of integer data from a 32-byte aligned memory
-/// location pointed to by \a __p into elements of a 256-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
-///
-/// \param __p
-/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
-/// values.
-/// \returns A 256-bit integer vector containing the moved values.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_load_si256(__m256i const *__p)
-{
- return *__p;
-}
-
-/// Loads 256 bits of integer data from an unaligned memory location
-/// pointed to by \a __p into a 256-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
-///
-/// \param __p
-/// A pointer to a 256-bit integer vector containing integer values.
-/// \returns A 256-bit integer vector containing the moved values.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_loadu_si256(__m256i_u const *__p)
-{
- struct __loadu_si256 {
- __m256i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_si256*)__p)->__v;
-}
-
-/// Loads 256 bits of integer data from an unaligned memory location
-/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
-/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
-/// line boundary.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
-///
-/// \param __p
-/// A pointer to a 256-bit integer vector containing integer values.
-/// \returns A 256-bit integer vector containing the moved values.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_lddqu_si256(__m256i const *__p)
-{
- return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
-}
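-
-/* Illustrative usage sketch (hypothetical helper, assumes <immintrin.h> and
- * -mavx): the "u" variants tolerate any alignment, whereas _mm256_load_pd
- * requires the pointer to be 32-byte aligned. */
-static inline __m256d example_load_any_alignment(double const *p)
-{
-  return _mm256_loadu_pd(p); /* safe even if p is not 32-byte aligned */
-}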
-
-/* SIMD store ops */
-/// Stores double-precision floating point values from a 256-bit vector
-/// of [4 x double] to a 32-byte aligned memory location pointed to by
-/// \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
-///
-/// \param __p
-/// A 32-byte aligned pointer to a memory location that will receive the
-/// double-precision floating point values.
-/// \param __a
-/// A 256-bit vector of [4 x double] containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_store_pd(double *__p, __m256d __a)
-{
- *(__m256d *)__p = __a;
-}
-
-/// Stores single-precision floating point values from a 256-bit vector
-/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
-///
-/// \param __p
-/// A 32-byte aligned pointer to a memory location that will receive the
-/// float values.
-/// \param __a
-/// A 256-bit vector of [8 x float] containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_store_ps(float *__p, __m256 __a)
-{
- *(__m256 *)__p = __a;
-}
-
-/// Stores double-precision floating point values from a 256-bit vector
-/// of [4 x double] to an unaligned memory location pointed to by \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that will receive the double-precision
-/// floating point values.
-/// \param __a
-/// A 256-bit vector of [4 x double] containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu_pd(double *__p, __m256d __a)
-{
- struct __storeu_pd {
- __m256d_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_pd*)__p)->__v = __a;
-}
-
-/// Stores single-precision floating point values from a 256-bit vector
-/// of [8 x float] to an unaligned memory location pointed to by \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that will receive the float values.
-/// \param __a
-/// A 256-bit vector of [8 x float] containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu_ps(float *__p, __m256 __a)
-{
- struct __storeu_ps {
- __m256_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_ps*)__p)->__v = __a;
-}
-
-/// Stores integer values from a 256-bit integer vector to a 32-byte
-/// aligned memory location pointed to by \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
-///
-/// \param __p
-/// A 32-byte aligned pointer to a memory location that will receive the
-/// integer values.
-/// \param __a
-/// A 256-bit integer vector containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_store_si256(__m256i *__p, __m256i __a)
-{
- *__p = __a;
-}
-
-/// Stores integer values from a 256-bit integer vector to an unaligned
-/// memory location pointed to by \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that will receive the integer values.
-/// \param __a
-/// A 256-bit integer vector containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
-{
- struct __storeu_si256 {
- __m256i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_si256*)__p)->__v = __a;
-}
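-
-/* Illustrative usage sketch (hypothetical helper, assumes <immintrin.h>,
- * -mavx, and C11 for _Alignas): an aligned 256-bit integer store into a
- * suitably aligned buffer. */
-static inline int example_store_aligned(__m256i v)
-{
-  _Alignas(32) int out[8];               /* C11: force 32-byte alignment */
-  _mm256_store_si256((__m256i *)out, v); /* aligned store; faults if misaligned */
-  return out[0];                         /* bits [31:0] of v */
-}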
-
-/* Conditional load ops */
-/// Conditionally loads double-precision floating point elements from a
-/// memory location pointed to by \a __p into a 128-bit vector of
-/// [2 x double], depending on the mask bits associated with each data
-/// element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that contains the double-precision
-/// floating point values.
-/// \param __m
-/// A 128-bit integer vector containing the mask. The most significant bit of
-/// each data element represents the mask bits. If a mask bit is zero, the
-/// corresponding value in the memory location is not loaded and the
-/// corresponding field in the return value is set to zero.
-/// \returns A 128-bit vector of [2 x double] containing the loaded values.
-static __inline __m128d __DEFAULT_FN_ATTRS128
-_mm_maskload_pd(double const *__p, __m128i __m)
-{
- return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
-}
-
-/// Conditionally loads double-precision floating point elements from a
-/// memory location pointed to by \a __p into a 256-bit vector of
-/// [4 x double], depending on the mask bits associated with each data
-/// element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that contains the double-precision
-/// floating point values.
-/// \param __m
-/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
-/// significant bit of each quadword element represents the mask bits. If a
-/// mask bit is zero, the corresponding value in the memory location is not
-/// loaded and the corresponding field in the return value is set to zero.
-/// \returns A 256-bit vector of [4 x double] containing the loaded values.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_maskload_pd(double const *__p, __m256i __m)
-{
- return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
- (__v4di)__m);
-}
-
-/// Conditionally loads single-precision floating point elements from a
-/// memory location pointed to by \a __p into a 128-bit vector of
-/// [4 x float], depending on the mask bits associated with each data
-/// element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that contains the single-precision
-/// floating point values.
-/// \param __m
-/// A 128-bit integer vector containing the mask. The most significant bit of
-/// each data element represents the mask bits. If a mask bit is zero, the
-/// corresponding value in the memory location is not loaded and the
-/// corresponding field in the return value is set to zero.
-/// \returns A 128-bit vector of [4 x float] containing the loaded values.
-static __inline __m128 __DEFAULT_FN_ATTRS128
-_mm_maskload_ps(float const *__p, __m128i __m)
-{
- return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
-}
-
-/// Conditionally loads single-precision floating point elements from a
-/// memory location pointed to by \a __p into a 256-bit vector of
-/// [8 x float], depending on the mask bits associated with each data
-/// element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that contains the single-precision
-/// floating point values.
-/// \param __m
-/// A 256-bit integer vector of [8 x dword] containing the mask. The most
-/// significant bit of each dword element represents the mask bits. If a mask
-/// bit is zero, the corresponding value in the memory location is not loaded
-/// and the corresponding field in the return value is set to zero.
-/// \returns A 256-bit vector of [8 x float] containing the loaded values.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_maskload_ps(float const *__p, __m256i __m)
-{
- return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
-}
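-
-/* Illustrative usage sketch (hypothetical helper, assumes <immintrin.h> and
- * -mavx): loading only the first three floats of a row without touching
- * memory past them; unselected lanes read as zero. */
-static inline __m256 example_load_first3(float const *p)
-{
-  __m256i mask = _mm256_set_epi32(0, 0, 0, 0, 0, -1, -1, -1);
-  return _mm256_maskload_ps(p, mask); /* lanes 0..2 from p[0..2], rest are 0 */
-}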
-
-/* Conditional store ops */
-/// Moves single-precision floating point values from a 256-bit vector
-/// of [8 x float] to a memory location pointed to by \a __p, according to
-/// the specified mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that will receive the float values.
-/// \param __m
-/// A 256-bit integer vector of [8 x dword] containing the mask. The most
-/// significant bit of each dword element in the mask vector represents the
-/// mask bits. If a mask bit is zero, the corresponding value from vector
-/// \a __a is not stored and the corresponding field in the memory location
-/// pointed to by \a __p is not changed.
-/// \param __a
-/// A 256-bit vector of [8 x float] containing the values to be stored.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
-{
- __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
-}
-
-/// Moves double-precision values from a 128-bit vector of [2 x double]
-/// to a memory location pointed to by \a __p, according to the specified
-/// mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that will receive the double-precision
-/// floating point values.
-/// \param __m
-/// A 128-bit integer vector containing the mask. The most significant bit of
-/// each field in the mask vector represents the mask bits. If a mask bit is
-/// zero, the corresponding value from vector \a __a is not stored and the
-/// corresponding field in the memory location pointed to by \a __p is not
-/// changed.
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the values to be stored.
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
-{
- __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
-}
-
-/// Moves double-precision values from a 256-bit vector of [4 x double]
-/// to a memory location pointed to by \a __p, according to the specified
-/// mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that will receive the double-precision
-/// floating point values.
-/// \param __m
-/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
-/// significant bit of each quadword element in the mask vector represents
-/// the mask bits. If a mask bit is zero, the corresponding value from vector
-/// \a __a is not stored and the corresponding field in the memory location
-/// pointed to by \a __p is not changed.
-/// \param __a
-/// A 256-bit vector of [4 x double] containing the values to be stored.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
-{
- __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
-}
-
-/// Moves single-precision floating point values from a 128-bit vector
-/// of [4 x float] to a memory location pointed to by \a __p, according to
-/// the specified mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that will receive the float values.
-/// \param __m
-/// A 128-bit integer vector containing the mask. The most significant bit of
-/// each field in the mask vector represents the mask bits. If a mask bit is
-/// zero, the corresponding value from vector \a __a is not stored and the
-/// corresponding field in the memory location pointed to by \a __p is not
-/// changed.
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the values to be stored.
-static __inline void __DEFAULT_FN_ATTRS128
-_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
-{
- __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
-}
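-
-/* Illustrative usage sketch (hypothetical helper, assumes <immintrin.h> and
- * -mavx): storing only the two low doubles of a vector, leaving dst[2] and
- * dst[3] untouched. */
-static inline void example_store_first2(double *dst, __m256d v)
-{
-  __m256i mask = _mm256_set_epi64x(0, 0, -1LL, -1LL);
-  _mm256_maskstore_pd(dst, mask, v); /* writes dst[0] and dst[1] only */
-}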
-
-/* Cacheability support ops */
-/// Moves integer data from a 256-bit integer vector to a 32-byte
-/// aligned memory location. To minimize caching, the data is flagged as
-/// non-temporal (unlikely to be used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
-///
-/// \param __a
-/// A pointer to a 32-byte aligned memory location that will receive the
-/// integer values.
-/// \param __b
-/// A 256-bit integer vector containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_si256(__m256i *__a, __m256i __b)
-{
- typedef __v4di __v4di_aligned __attribute__((aligned(32)));
- __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
-}
-
-/// Moves double-precision values from a 256-bit vector of [4 x double]
-/// to a 32-byte aligned memory location. To minimize caching, the data is
-/// flagged as non-temporal (unlikely to be used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
-///
-/// \param __a
-/// A pointer to a 32-byte aligned memory location that will receive the
-/// double-precision floating-point values.
-/// \param __b
-/// A 256-bit vector of [4 x double] containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_pd(double *__a, __m256d __b)
-{
- typedef __v4df __v4df_aligned __attribute__((aligned(32)));
- __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
-}
-
-/// Moves single-precision floating point values from a 256-bit vector
-/// of [8 x float] to a 32-byte aligned memory location. To minimize
-/// caching, the data is flagged as non-temporal (unlikely to be used again
-/// soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a 32-byte aligned memory location that will receive the
-/// single-precision floating point values.
-/// \param __a
-/// A 256-bit vector of [8 x float] containing the values to be moved.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_ps(float *__p, __m256 __a)
-{
- typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
- __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
-}
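-
-/* Illustrative usage sketch (hypothetical helper, assumes <immintrin.h> and
- * -mavx): non-temporal stores require a 32-byte aligned destination and are
- * typically followed by _mm_sfence() before the data is consumed by another
- * thread or device. */
-static inline void example_stream_out(double *dst32 /* 32-byte aligned */, __m256d v)
-{
-  _mm256_stream_pd(dst32, v); /* store that minimizes cache pollution */
-}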
-
-/* Create vectors */
-/// Create a 256-bit vector of [4 x double] with undefined values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \returns A 256-bit vector of [4 x double] containing undefined values.
-static __inline__ __m256d __DEFAULT_FN_ATTRS
-_mm256_undefined_pd(void)
-{
- return (__m256d)__builtin_ia32_undef256();
-}
-
-/// Create a 256-bit vector of [8 x float] with undefined values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \returns A 256-bit vector of [8 x float] containing undefined values.
-static __inline__ __m256 __DEFAULT_FN_ATTRS
-_mm256_undefined_ps(void)
-{
- return (__m256)__builtin_ia32_undef256();
-}
-
-/// Create a 256-bit integer vector with undefined values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \returns A 256-bit integer vector containing undefined values.
-static __inline__ __m256i __DEFAULT_FN_ATTRS
-_mm256_undefined_si256(void)
-{
- return (__m256i)__builtin_ia32_undef256();
-}
-
-/// Constructs a 256-bit floating-point vector of [4 x double]
-/// initialized with the specified double-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
-/// instruction.
-///
-/// \param __a
-/// A double-precision floating-point value used to initialize bits [255:192]
-/// of the result.
-/// \param __b
-/// A double-precision floating-point value used to initialize bits [191:128]
-/// of the result.
-/// \param __c
-/// A double-precision floating-point value used to initialize bits [127:64]
-/// of the result.
-/// \param __d
-/// A double-precision floating-point value used to initialize bits [63:0]
-/// of the result.
-/// \returns An initialized 256-bit floating-point vector of [4 x double].
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_set_pd(double __a, double __b, double __c, double __d)
-{
- return __extension__ (__m256d){ __d, __c, __b, __a };
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float] initialized
-/// with the specified single-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __a
-/// A single-precision floating-point value used to initialize bits [255:224]
-/// of the result.
-/// \param __b
-/// A single-precision floating-point value used to initialize bits [223:192]
-/// of the result.
-/// \param __c
-/// A single-precision floating-point value used to initialize bits [191:160]
-/// of the result.
-/// \param __d
-/// A single-precision floating-point value used to initialize bits [159:128]
-/// of the result.
-/// \param __e
-/// A single-precision floating-point value used to initialize bits [127:96]
-/// of the result.
-/// \param __f
-/// A single-precision floating-point value used to initialize bits [95:64]
-/// of the result.
-/// \param __g
-/// A single-precision floating-point value used to initialize bits [63:32]
-/// of the result.
-/// \param __h
-/// A single-precision floating-point value used to initialize bits [31:0]
-/// of the result.
-/// \returns An initialized 256-bit floating-point vector of [8 x float].
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_set_ps(float __a, float __b, float __c, float __d,
- float __e, float __f, float __g, float __h)
-{
- return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
-}
-
-/// Constructs a 256-bit integer vector initialized with the specified
-/// 32-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __i0
-/// A 32-bit integral value used to initialize bits [255:224] of the result.
-/// \param __i1
-/// A 32-bit integral value used to initialize bits [223:192] of the result.
-/// \param __i2
-/// A 32-bit integral value used to initialize bits [191:160] of the result.
-/// \param __i3
-/// A 32-bit integral value used to initialize bits [159:128] of the result.
-/// \param __i4
-/// A 32-bit integral value used to initialize bits [127:96] of the result.
-/// \param __i5
-/// A 32-bit integral value used to initialize bits [95:64] of the result.
-/// \param __i6
-/// A 32-bit integral value used to initialize bits [63:32] of the result.
-/// \param __i7
-/// A 32-bit integral value used to initialize bits [31:0] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
- int __i4, int __i5, int __i6, int __i7)
-{
- return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
-}
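-
-/* Example (illustrative sketch): _mm256_set_epi32 takes its arguments from
- * the most significant element down to the least significant one, so the
- * last argument ends up in the lowest 32 bits.
- *
- *   __m256i v = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
- *   int out[8] __attribute__((aligned(32)));
- *   _mm256_store_si256((__m256i *)out, v);        // out[0] == 0 ... out[7] == 7
- */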
-
-/// Constructs a 256-bit integer vector initialized with the specified
-/// 16-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __w15
-/// A 16-bit integral value used to initialize bits [255:240] of the result.
-/// \param __w14
-/// A 16-bit integral value used to initialize bits [239:224] of the result.
-/// \param __w13
-/// A 16-bit integral value used to initialize bits [223:208] of the result.
-/// \param __w12
-/// A 16-bit integral value used to initialize bits [207:192] of the result.
-/// \param __w11
-/// A 16-bit integral value used to initialize bits [191:176] of the result.
-/// \param __w10
-/// A 16-bit integral value used to initialize bits [175:160] of the result.
-/// \param __w09
-/// A 16-bit integral value used to initialize bits [159:144] of the result.
-/// \param __w08
-/// A 16-bit integral value used to initialize bits [143:128] of the result.
-/// \param __w07
-/// A 16-bit integral value used to initialize bits [127:112] of the result.
-/// \param __w06
-/// A 16-bit integral value used to initialize bits [111:96] of the result.
-/// \param __w05
-/// A 16-bit integral value used to initialize bits [95:80] of the result.
-/// \param __w04
-/// A 16-bit integral value used to initialize bits [79:64] of the result.
-/// \param __w03
-/// A 16-bit integral value used to initialize bits [63:48] of the result.
-/// \param __w02
-/// A 16-bit integral value used to initialize bits [47:32] of the result.
-/// \param __w01
-/// A 16-bit integral value used to initialize bits [31:16] of the result.
-/// \param __w00
-/// A 16-bit integral value used to initialize bits [15:0] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
- short __w11, short __w10, short __w09, short __w08,
- short __w07, short __w06, short __w05, short __w04,
- short __w03, short __w02, short __w01, short __w00)
-{
- return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
- __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
-}
-
-/// Constructs a 256-bit integer vector initialized with the specified
-/// 8-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __b31
-/// An 8-bit integral value used to initialize bits [255:248] of the result.
-/// \param __b30
-/// An 8-bit integral value used to initialize bits [247:240] of the result.
-/// \param __b29
-/// An 8-bit integral value used to initialize bits [239:232] of the result.
-/// \param __b28
-/// An 8-bit integral value used to initialize bits [231:224] of the result.
-/// \param __b27
-/// An 8-bit integral value used to initialize bits [223:216] of the result.
-/// \param __b26
-/// An 8-bit integral value used to initialize bits [215:208] of the result.
-/// \param __b25
-/// An 8-bit integral value used to initialize bits [207:200] of the result.
-/// \param __b24
-/// An 8-bit integral value used to initialize bits [199:192] of the result.
-/// \param __b23
-/// An 8-bit integral value used to initialize bits [191:184] of the result.
-/// \param __b22
-/// An 8-bit integral value used to initialize bits [183:176] of the result.
-/// \param __b21
-/// An 8-bit integral value used to initialize bits [175:168] of the result.
-/// \param __b20
-/// An 8-bit integral value used to initialize bits [167:160] of the result.
-/// \param __b19
-/// An 8-bit integral value used to initialize bits [159:152] of the result.
-/// \param __b18
-/// An 8-bit integral value used to initialize bits [151:144] of the result.
-/// \param __b17
-/// An 8-bit integral value used to initialize bits [143:136] of the result.
-/// \param __b16
-/// An 8-bit integral value used to initialize bits [135:128] of the result.
-/// \param __b15
-/// An 8-bit integral value used to initialize bits [127:120] of the result.
-/// \param __b14
-/// An 8-bit integral value used to initialize bits [119:112] of the result.
-/// \param __b13
-/// An 8-bit integral value used to initialize bits [111:104] of the result.
-/// \param __b12
-/// An 8-bit integral value used to initialize bits [103:96] of the result.
-/// \param __b11
-/// An 8-bit integral value used to initialize bits [95:88] of the result.
-/// \param __b10
-/// An 8-bit integral value used to initialize bits [87:80] of the result.
-/// \param __b09
-/// An 8-bit integral value used to initialize bits [79:72] of the result.
-/// \param __b08
-/// An 8-bit integral value used to initialize bits [71:64] of the result.
-/// \param __b07
-/// An 8-bit integral value used to initialize bits [63:56] of the result.
-/// \param __b06
-/// An 8-bit integral value used to initialize bits [55:48] of the result.
-/// \param __b05
-/// An 8-bit integral value used to initialize bits [47:40] of the result.
-/// \param __b04
-/// An 8-bit integral value used to initialize bits [39:32] of the result.
-/// \param __b03
-/// An 8-bit integral value used to initialize bits [31:24] of the result.
-/// \param __b02
-/// An 8-bit integral value used to initialize bits [23:16] of the result.
-/// \param __b01
-/// An 8-bit integral value used to initialize bits [15:8] of the result.
-/// \param __b00
-/// An 8-bit integral value used to initialize bits [7:0] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
- char __b27, char __b26, char __b25, char __b24,
- char __b23, char __b22, char __b21, char __b20,
- char __b19, char __b18, char __b17, char __b16,
- char __b15, char __b14, char __b13, char __b12,
- char __b11, char __b10, char __b09, char __b08,
- char __b07, char __b06, char __b05, char __b04,
- char __b03, char __b02, char __b01, char __b00)
-{
- return __extension__ (__m256i)(__v32qi){
- __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
- __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
- __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
- __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
- };
-}
-
-/// Constructs a 256-bit integer vector initialized with the specified
-/// 64-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
-/// instruction.
-///
-/// \param __a
-/// A 64-bit integral value used to initialize bits [255:192] of the result.
-/// \param __b
-/// A 64-bit integral value used to initialize bits [191:128] of the result.
-/// \param __c
-/// A 64-bit integral value used to initialize bits [127:64] of the result.
-/// \param __d
-/// A 64-bit integral value used to initialize bits [63:0] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
-{
- return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
-}
-
-/* Create vectors with elements in reverse order */
-/// Constructs a 256-bit floating-point vector of [4 x double],
-/// initialized in reverse order with the specified double-precision
-/// floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
-/// instruction.
-///
-/// \param __a
-/// A double-precision floating-point value used to initialize bits [63:0]
-/// of the result.
-/// \param __b
-/// A double-precision floating-point value used to initialize bits [127:64]
-/// of the result.
-/// \param __c
-/// A double-precision floating-point value used to initialize bits [191:128]
-/// of the result.
-/// \param __d
-/// A double-precision floating-point value used to initialize bits [255:192]
-/// of the result.
-/// \returns An initialized 256-bit floating-point vector of [4 x double].
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_setr_pd(double __a, double __b, double __c, double __d)
-{
- return _mm256_set_pd(__d, __c, __b, __a);
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float],
-/// initialized in reverse order with the specified single-precision
-/// floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __a
-/// A single-precision floating-point value used to initialize bits [31:0]
-/// of the result.
-/// \param __b
-/// A single-precision floating-point value used to initialize bits [63:32]
-/// of the result.
-/// \param __c
-/// A single-precision floating-point value used to initialize bits [95:64]
-/// of the result.
-/// \param __d
-/// A single-precision floating-point value used to initialize bits [127:96]
-/// of the result.
-/// \param __e
-/// A single-precision floating-point value used to initialize bits [159:128]
-/// of the result.
-/// \param __f
-/// A single-precision floating-point value used to initialize bits [191:160]
-/// of the result.
-/// \param __g
-/// A single-precision floating-point value used to initialize bits [223:192]
-/// of the result.
-/// \param __h
-/// A single-precision floating-point value used to initialize bits [255:224]
-/// of the result.
-/// \returns An initialized 256-bit floating-point vector of [8 x float].
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_setr_ps(float __a, float __b, float __c, float __d,
- float __e, float __f, float __g, float __h)
-{
- return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
-}
-
-/// Constructs a 256-bit integer vector, initialized in reverse order
-/// with the specified 32-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __i0
-/// A 32-bit integral value used to initialize bits [31:0] of the result.
-/// \param __i1
-/// A 32-bit integral value used to initialize bits [63:32] of the result.
-/// \param __i2
-/// A 32-bit integral value used to initialize bits [95:64] of the result.
-/// \param __i3
-/// A 32-bit integral value used to initialize bits [127:96] of the result.
-/// \param __i4
-/// A 32-bit integral value used to initialize bits [159:128] of the result.
-/// \param __i5
-/// A 32-bit integral value used to initialize bits [191:160] of the result.
-/// \param __i6
-/// A 32-bit integral value used to initialize bits [223:192] of the result.
-/// \param __i7
-/// A 32-bit integral value used to initialize bits [255:224] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
- int __i4, int __i5, int __i6, int __i7)
-{
- return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
-}
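-
-/* Example (illustrative sketch): the setr variants take their arguments in
- * memory order, so this call builds the same vector as the set example above.
- *
- *   __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
- *   __m256i b = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
- *   // a and b hold identical bit patterns
- */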
-
-/// Constructs a 256-bit integer vector, initialized in reverse order
-/// with the specified 16-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __w15
-/// A 16-bit integral value used to initialize bits [15:0] of the result.
-/// \param __w14
-/// A 16-bit integral value used to initialize bits [31:16] of the result.
-/// \param __w13
-/// A 16-bit integral value used to initialize bits [47:32] of the result.
-/// \param __w12
-/// A 16-bit integral value used to initialize bits [63:48] of the result.
-/// \param __w11
-/// A 16-bit integral value used to initialize bits [79:64] of the result.
-/// \param __w10
-/// A 16-bit integral value used to initialize bits [95:80] of the result.
-/// \param __w09
-/// A 16-bit integral value used to initialize bits [111:96] of the result.
-/// \param __w08
-/// A 16-bit integral value used to initialize bits [127:112] of the result.
-/// \param __w07
-/// A 16-bit integral value used to initialize bits [143:128] of the result.
-/// \param __w06
-/// A 16-bit integral value used to initialize bits [159:144] of the result.
-/// \param __w05
-/// A 16-bit integral value used to initialize bits [175:160] of the result.
-/// \param __w04
-/// A 16-bit integral value used to initialize bits [191:176] of the result.
-/// \param __w03
-/// A 16-bit integral value used to initialize bits [207:192] of the result.
-/// \param __w02
-/// A 16-bit integral value used to initialize bits [223:208] of the result.
-/// \param __w01
-/// A 16-bit integral value used to initialize bits [239:224] of the result.
-/// \param __w00
-/// A 16-bit integral value used to initialize bits [255:240] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
- short __w11, short __w10, short __w09, short __w08,
- short __w07, short __w06, short __w05, short __w04,
- short __w03, short __w02, short __w01, short __w00)
-{
- return _mm256_set_epi16(__w00, __w01, __w02, __w03,
- __w04, __w05, __w06, __w07,
- __w08, __w09, __w10, __w11,
- __w12, __w13, __w14, __w15);
-}
-
-/// Constructs a 256-bit integer vector, initialized in reverse order
-/// with the specified 8-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __b31
-/// An 8-bit integral value used to initialize bits [7:0] of the result.
-/// \param __b30
-/// An 8-bit integral value used to initialize bits [15:8] of the result.
-/// \param __b29
-/// An 8-bit integral value used to initialize bits [23:16] of the result.
-/// \param __b28
-/// An 8-bit integral value used to initialize bits [31:24] of the result.
-/// \param __b27
-/// An 8-bit integral value used to initialize bits [39:32] of the result.
-/// \param __b26
-/// An 8-bit integral value used to initialize bits [47:40] of the result.
-/// \param __b25
-/// An 8-bit integral value used to initialize bits [55:48] of the result.
-/// \param __b24
-/// An 8-bit integral value used to initialize bits [63:56] of the result.
-/// \param __b23
-/// An 8-bit integral value used to initialize bits [71:64] of the result.
-/// \param __b22
-/// An 8-bit integral value used to initialize bits [79:72] of the result.
-/// \param __b21
-/// An 8-bit integral value used to initialize bits [87:80] of the result.
-/// \param __b20
-/// An 8-bit integral value used to initialize bits [95:88] of the result.
-/// \param __b19
-/// An 8-bit integral value used to initialize bits [103:96] of the result.
-/// \param __b18
-/// An 8-bit integral value used to initialize bits [111:104] of the result.
-/// \param __b17
-/// An 8-bit integral value used to initialize bits [119:112] of the result.
-/// \param __b16
-/// An 8-bit integral value used to initialize bits [127:120] of the result.
-/// \param __b15
-/// An 8-bit integral value used to initialize bits [135:128] of the result.
-/// \param __b14
-/// An 8-bit integral value used to initialize bits [143:136] of the result.
-/// \param __b13
-/// An 8-bit integral value used to initialize bits [151:144] of the result.
-/// \param __b12
-/// An 8-bit integral value used to initialize bits [159:152] of the result.
-/// \param __b11
-/// An 8-bit integral value used to initialize bits [167:160] of the result.
-/// \param __b10
-/// An 8-bit integral value used to initialize bits [175:168] of the result.
-/// \param __b09
-/// An 8-bit integral value used to initialize bits [183:176] of the result.
-/// \param __b08
-/// An 8-bit integral value used to initialize bits [191:184] of the result.
-/// \param __b07
-/// An 8-bit integral value used to initialize bits [199:192] of the result.
-/// \param __b06
-/// An 8-bit integral value used to initialize bits [207:200] of the result.
-/// \param __b05
-/// An 8-bit integral value used to initialize bits [215:208] of the result.
-/// \param __b04
-/// An 8-bit integral value used to initialize bits [223:216] of the result.
-/// \param __b03
-/// An 8-bit integral value used to initialize bits [231:224] of the result.
-/// \param __b02
-/// An 8-bit integral value used to initialize bits [239:232] of the result.
-/// \param __b01
-/// An 8-bit integral value used to initialize bits [247:240] of the result.
-/// \param __b00
-/// An 8-bit integral value used to initialize bits [255:248] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
- char __b27, char __b26, char __b25, char __b24,
- char __b23, char __b22, char __b21, char __b20,
- char __b19, char __b18, char __b17, char __b16,
- char __b15, char __b14, char __b13, char __b12,
- char __b11, char __b10, char __b09, char __b08,
- char __b07, char __b06, char __b05, char __b04,
- char __b03, char __b02, char __b01, char __b00)
-{
- return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
- __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
- __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
- __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
-}
-
-/// Constructs a 256-bit integer vector, initialized in reverse order
-/// with the specified 64-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
-/// instruction.
-///
-/// \param __a
-/// A 64-bit integral value used to initialize bits [63:0] of the result.
-/// \param __b
-/// A 64-bit integral value used to initialize bits [127:64] of the result.
-/// \param __c
-/// A 64-bit integral value used to initialize bits [191:128] of the result.
-/// \param __d
-/// A 64-bit integral value used to initialize bits [255:192] of the result.
-/// \returns An initialized 256-bit integer vector.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
-{
- return _mm256_set_epi64x(__d, __c, __b, __a);
-}
-
-/* Create vectors with repeated elements */
-/// Constructs a 256-bit floating-point vector of [4 x double], with each
-/// of the four double-precision floating-point vector elements set to the
-/// specified double-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
-///
-/// \param __w
-/// A double-precision floating-point value used to initialize each vector
-/// element of the result.
-/// \returns An initialized 256-bit floating-point vector of [4 x double].
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_set1_pd(double __w)
-{
- return _mm256_set_pd(__w, __w, __w, __w);
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float], with each
-/// of the eight single-precision floating-point vector elements set to the
-/// specified single-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
-/// instruction.
-///
-/// \param __w
-/// A single-precision floating-point value used to initialize each vector
-/// element of the result.
-/// \returns An initialized 256-bit floating-point vector of [8 x float].
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_set1_ps(float __w)
-{
- return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
-}
-
-/// Constructs a 256-bit integer vector of [8 x i32], with each of the
-/// 32-bit integral vector elements set to the specified 32-bit integral
-/// value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
-/// instruction.
-///
-/// \param __i
-/// A 32-bit integral value used to initialize each vector element of the
-/// result.
-/// \returns An initialized 256-bit integer vector of [8 x i32].
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set1_epi32(int __i)
-{
- return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
-}
-
-/// Constructs a 256-bit integer vector of [16 x i16], with each of the
-/// 16-bit integral vector elements set to the specified 16-bit integral
-/// value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
-///
-/// \param __w
-/// A 16-bit integral value used to initialize each vector element of the
-/// result.
-/// \returns An initialized 256-bit integer vector of [16 x i16].
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set1_epi16(short __w)
-{
- return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
- __w, __w, __w, __w, __w, __w, __w, __w);
-}
-
-/// Constructs a 256-bit integer vector of [32 x i8], with each of the
-/// 8-bit integral vector elements set to the specified 8-bit integral value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
-///
-/// \param __b
-/// An 8-bit integral value used to initialize each vector element of the
-/// result.
-/// \returns An initialized 256-bit integer vector of [32 x i8].
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set1_epi8(char __b)
-{
- return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
- __b, __b, __b, __b, __b, __b, __b, __b,
- __b, __b, __b, __b, __b, __b, __b, __b,
- __b, __b, __b, __b, __b, __b, __b, __b);
-}
-
-/// Constructs a 256-bit integer vector of [4 x i64], with each of the
-/// 64-bit integral vector elements set to the specified 64-bit integral
-/// value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
-///
-/// \param __q
-/// A 64-bit integral value used to initialize each vector element of the
-/// result.
-/// \returns An initialized 256-bit integer vector of [4 x i64].
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set1_epi64x(long long __q)
-{
- return _mm256_set_epi64x(__q, __q, __q, __q);
-}
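-
-/* Example (illustrative sketch): the set1 family broadcasts one scalar into
- * every element, which is the usual way to build a constant operand.
- *
- *   __m256 xs   = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
- *   __m256 bias = _mm256_set1_ps(0.5f);
- *   __m256 ys   = _mm256_add_ps(xs, bias);        // every element shifted by 0.5
- */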
-
-/* Create __zeroed vectors */
-/// Constructs a 256-bit floating-point vector of [4 x double] with all
-/// vector elements initialized to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
-///
-/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_setzero_pd(void)
-{
- return __extension__ (__m256d){ 0, 0, 0, 0 };
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float] with all
-/// vector elements initialized to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
-///
-/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_setzero_ps(void)
-{
- return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
-}
-
-/// Constructs a 256-bit integer vector initialized to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
-///
-/// \returns A 256-bit integer vector initialized to zero.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setzero_si256(void)
-{
- return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
-}
-
-/* Cast between vector types */
-/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
-/// floating-point vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit floating-point vector of [4 x double].
-/// \returns A 256-bit floating-point vector of [8 x float] containing the same
-/// bitwise pattern as the parameter.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_castpd_ps(__m256d __a)
-{
- return (__m256)__a;
-}
-
-/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
-/// integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit floating-point vector of [4 x double].
-/// \returns A 256-bit integer vector containing the same bitwise pattern as the
-/// parameter.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_castpd_si256(__m256d __a)
-{
- return (__m256i)__a;
-}
-
-/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
-/// floating-point vector of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit floating-point vector of [8 x float].
-/// \returns A 256-bit floating-point vector of [4 x double] containing the same
-/// bitwise pattern as the parameter.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_castps_pd(__m256 __a)
-{
- return (__m256d)__a;
-}
-
-/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
-/// integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit floating-point vector of [8 x float].
-/// \returns A 256-bit integer vector containing the same bitwise pattern as the
-/// parameter.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_castps_si256(__m256 __a)
-{
- return (__m256i)__a;
-}
-
-/// Casts a 256-bit integer vector into a 256-bit floating-point vector
-/// of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit integer vector.
-/// \returns A 256-bit floating-point vector of [8 x float] containing the same
-/// bitwise pattern as the parameter.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_castsi256_ps(__m256i __a)
-{
- return (__m256)__a;
-}
-
-/// Casts a 256-bit integer vector into a 256-bit floating-point vector
-/// of [4 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit integer vector.
-/// \returns A 256-bit floating-point vector of [4 x double] containing the same
-/// bitwise pattern as the parameter.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_castsi256_pd(__m256i __a)
-{
- return (__m256d)__a;
-}
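-
-/* Example (illustrative sketch): the 256-bit casts only reinterpret bits;
- * they perform no value conversion and emit no instructions.
- *
- *   __m256  f = _mm256_set1_ps(1.0f);
- *   __m256i i = _mm256_castps_si256(f);           // same bits, integer type
- *   __m256  g = _mm256_castsi256_ps(i);           // round-trips to the original
- */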
-
-/// Returns the lower 128 bits of a 256-bit floating-point vector of
-/// [4 x double] as a 128-bit floating-point vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit floating-point vector of [4 x double].
-/// \returns A 128-bit floating-point vector of [2 x double] containing the
-/// lower 128 bits of the parameter.
-static __inline __m128d __DEFAULT_FN_ATTRS
-_mm256_castpd256_pd128(__m256d __a)
-{
- return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
-}
-
-/// Returns the lower 128 bits of a 256-bit floating-point vector of
-/// [8 x float] as a 128-bit floating-point vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit floating-point vector of [8 x float].
-/// \returns A 128-bit floating-point vector of [4 x float] containing the
-/// lower 128 bits of the parameter.
-static __inline __m128 __DEFAULT_FN_ATTRS
-_mm256_castps256_ps128(__m256 __a)
-{
- return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
-}
-
-/// Truncates a 256-bit integer vector into a 128-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 256-bit integer vector.
-/// \returns A 128-bit integer vector containing the lower 128 bits of the
-/// parameter.
-static __inline __m128i __DEFAULT_FN_ATTRS
-_mm256_castsi256_si128(__m256i __a)
-{
- return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
-}
-
-/// Constructs a 256-bit floating-point vector of [4 x double] from a
-/// 128-bit floating-point vector of [2 x double].
-///
-/// The lower 128 bits contain the value of the source vector. The contents
-/// of the upper 128 bits are undefined.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
-/// contain the value of the parameter. The contents of the upper 128 bits
-/// are undefined.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_castpd128_pd256(__m128d __a)
-{
- return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float] from a
-/// 128-bit floating-point vector of [4 x float].
-///
-/// The lower 128 bits contain the value of the source vector. The contents
-/// of the upper 128 bits are undefined.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
-/// contain the value of the parameter. The contents of the upper 128 bits
-/// are undefined.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_castps128_ps256(__m128 __a)
-{
- return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
-}
-
-/// Constructs a 256-bit integer vector from a 128-bit integer vector.
-///
-/// The lower 128 bits contain the value of the source vector. The contents
-/// of the upper 128 bits are undefined.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
-/// the parameter. The contents of the upper 128 bits are undefined.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_castsi128_si256(__m128i __a)
-{
- return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
-}
-
-/// Constructs a 256-bit floating-point vector of [4 x double] from a
-/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
-/// contain the value of the source vector. The upper 128 bits are set
-/// to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
-/// contain the value of the parameter. The upper 128 bits are set to zero.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_zextpd128_pd256(__m128d __a)
-{
- return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float] from a
-/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
-/// the value of the source vector. The upper 128 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
-/// contain the value of the parameter. The upper 128 bits are set to zero.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_zextps128_ps256(__m128 __a)
-{
- return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-/// Constructs a 256-bit integer vector from a 128-bit integer vector.
-/// The lower 128 bits contain the value of the source vector. The upper
-/// 128 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
-/// the parameter. The upper 128 bits are set to zero.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_zextsi128_si256(__m128i __a)
-{
- return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
-}
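-
-/* Example (illustrative sketch): the cast from 128 to 256 bits leaves the
- * upper half undefined, while the zext variants guarantee zeros there, so
- * zext is the safe choice whenever the upper lane is read later.
- *
- *   __m128i lo = _mm_set1_epi32(5);
- *   __m256i a  = _mm256_zextsi128_si256(lo);      // upper 128 bits are zero
- *   __m256i b  = _mm256_castsi128_si256(lo);      // upper 128 bits unspecified
- */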
-
-/*
- Vector insert.
- We use macros rather than inlines because we only want to accept
- invocations where the immediate M is a constant expression.
-*/
-/// Constructs a new 256-bit vector of [8 x float] by first duplicating
-/// a 256-bit vector of [8 x float] given in the first parameter, and then
-/// replacing either the upper or the lower 128 bits with the contents of a
-/// 128-bit vector of [4 x float] in the second parameter.
-///
-/// The immediate integer parameter determines whether the upper or the lower
-/// 128 bits of the result are replaced.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param V1
-/// A 256-bit vector of [8 x float]. This vector is copied to the result
-/// first, and then either the upper or the lower 128 bits of the result will
-/// be replaced by the contents of \a V2.
-/// \param V2
-/// A 128-bit vector of [4 x float]. The contents of this parameter are
-/// written to either the upper or the lower 128 bits of the result depending
-/// on the value of parameter \a M.
-/// \param M
-/// An immediate integer. The least significant bit determines how the values
-/// from the two parameters are interleaved: \n
-/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
-/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
-/// result. \n
-/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
-/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
-/// result.
-/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
-#define _mm256_insertf128_ps(V1, V2, M) \
- ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \
- (__v4sf)(__m128)(V2), (int)(M)))
-
-/// Constructs a new 256-bit vector of [4 x double] by first duplicating
-/// a 256-bit vector of [4 x double] given in the first parameter, and then
-/// replacing either the upper or the lower 128 bits with the contents of a
-/// 128-bit vector of [2 x double] in the second parameter.
-///
-/// The immediate integer parameter determines whether the upper or the lower
-/// 128 bits of the result are replaced.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param V1
-/// A 256-bit vector of [4 x double]. This vector is copied to the result
-/// first, and then either the upper or the lower 128 bits of the result will
-/// be replaced by the contents of \a V2.
-/// \param V2
-/// A 128-bit vector of [2 x double]. The contents of this parameter are
-/// written to either the upper or the lower 128 bits of the result depending
-/// on the value of parameter \a M.
-/// \param M
-/// An immediate integer. The least significant bit determines how the values
-/// from the two parameters are interleaved: \n
-/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
-/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
-/// result. \n
-/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
-/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
-/// result.
-/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
-#define _mm256_insertf128_pd(V1, V2, M) \
- ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \
- (__v2df)(__m128d)(V2), (int)(M)))
-
-/// Constructs a new 256-bit integer vector by first duplicating a
-/// 256-bit integer vector given in the first parameter, and then replacing
-/// either the upper or the lower 128 bits with the contents of a 128-bit
-/// integer vector in the second parameter.
-///
-/// The immediate integer parameter determines whether the upper or the lower
-/// 128 bits of the result are replaced.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param V1
-/// A 256-bit integer vector. This vector is copied to the result first, and
-/// then either the upper or the lower 128 bits of the result will be
-/// replaced by the contents of \a V2.
-/// \param V2
-/// A 128-bit integer vector. The contents of this parameter are written to
-/// either the upper or the lower 128 bits of the result depending on the
-/// value of parameter \a M.
-/// \param M
-/// An immediate integer. The least significant bit determines how the values
-/// from the two parameters are interleaved: \n
-/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
-/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
-/// result. \n
-/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
-/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
-/// result.
-/// \returns A 256-bit integer vector containing the interleaved values.
-#define _mm256_insertf128_si256(V1, V2, M) \
- ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \
- (__v4si)(__m128i)(V2), (int)(M)))
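-
-/* Example (illustrative sketch): M must be a compile-time constant; bit 0 of
- * M selects which 128-bit half of the result is replaced.
- *
- *   __m256 base = _mm256_setzero_ps();
- *   __m128 part = _mm_set1_ps(2.0f);
- *   __m256 r    = _mm256_insertf128_ps(base, part, 1);  // replaces bits [255:128]
- */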
-
-/*
- Vector extract.
- We use macros rather than inlines because we only want to accept
- invocations where the immediate M is a constant expression.
-*/
-/// Extracts either the upper or the lower 128 bits from a 256-bit vector
-/// of [8 x float], as determined by the immediate integer parameter, and
-/// returns the extracted bits as a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
-///
-/// \param V
-/// A 256-bit vector of [8 x float].
-/// \param M
-/// An immediate integer. The least significant bit determines which bits are
-/// extracted from the first parameter: \n
-/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
-/// result. \n
-/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
-/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
-#define _mm256_extractf128_ps(V, M) \
- ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M)))
-
-/// Extracts either the upper or the lower 128 bits from a 256-bit vector
-/// of [4 x double], as determined by the immediate integer parameter, and
-/// returns the extracted bits as a 128-bit vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
-///
-/// \param V
-/// A 256-bit vector of [4 x double].
-/// \param M
-/// An immediate integer. The least significant bit determines which bits are
-/// extracted from the first parameter: \n
-/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
-/// result. \n
-/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
-/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
-#define _mm256_extractf128_pd(V, M) \
- ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M)))
-
-/// Extracts either the upper or the lower 128 bits from a 256-bit
-/// integer vector, as determined by the immediate integer parameter, and
-/// returns the extracted bits as a 128-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
-///
-/// \param V
-/// A 256-bit integer vector.
-/// \param M
-/// An immediate integer. The least significant bit determines which bits are
-/// extracted from the first parameter: \n
-/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
-/// result. \n
-/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
-/// \returns A 128-bit integer vector containing the extracted bits.
-#define _mm256_extractf128_si256(V, M) \
- ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M)))
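-
-/* Example (illustrative sketch): extracting the upper 128-bit half with a
- * constant selector of 1; a selector of 0 returns the lower half.
- *
- *   __m256d v   = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
- *   __m128d top = _mm256_extractf128_pd(v, 1);    // holds { 3.0, 4.0 }
- */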
-
-/// Constructs a 256-bit floating-point vector of [8 x float] by
-/// concatenating two 128-bit floating-point vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param __hi
-/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
-/// 128 bits of the result.
-/// \param __lo
-/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
-/// 128 bits of the result.
-/// \returns A 256-bit floating-point vector of [8 x float] containing the
-/// concatenated result.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_set_m128 (__m128 __hi, __m128 __lo)
-{
- return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
-}
-
-/// Constructs a 256-bit floating-point vector of [4 x double] by
-/// concatenating two 128-bit floating-point vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param __hi
-/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
-/// 128 bits of the result.
-/// \param __lo
-/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
-/// 128 bits of the result.
-/// \returns A 256-bit floating-point vector of [4 x double] containing the
-/// concatenated result.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_set_m128d (__m128d __hi, __m128d __lo)
-{
- return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
-}
-
-/// Constructs a 256-bit integer vector by concatenating two 128-bit
-/// integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param __hi
-/// A 128-bit integer vector to be copied to the upper 128 bits of the
-/// result.
-/// \param __lo
-/// A 128-bit integer vector to be copied to the lower 128 bits of the
-/// result.
-/// \returns A 256-bit integer vector containing the concatenated result.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_set_m128i (__m128i __hi, __m128i __lo)
-{
- return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
-}
-
-/// Constructs a 256-bit floating-point vector of [8 x float] by
-/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
-/// similar to _mm256_set_m128, but the order of the input parameters is
-/// swapped.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param __lo
-/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
-/// 128 bits of the result.
-/// \param __hi
-/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
-/// 128 bits of the result.
-/// \returns A 256-bit floating-point vector of [8 x float] containing the
-/// concatenated result.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_setr_m128 (__m128 __lo, __m128 __hi)
-{
- return _mm256_set_m128(__hi, __lo);
-}
-
-/// Constructs a 256-bit floating-point vector of [4 x double] by
-/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
-/// similar to _mm256_set_m128d, but the order of the input parameters is
-/// swapped.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param __lo
-/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
-/// 128 bits of the result.
-/// \param __hi
-/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
-/// 128 bits of the result.
-/// \returns A 256-bit floating-point vector of [4 x double] containing the
-/// concatenated result.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_setr_m128d (__m128d __lo, __m128d __hi)
-{
- return (__m256d)_mm256_set_m128d(__hi, __lo);
-}
-
-/// Constructs a 256-bit integer vector by concatenating two 128-bit
-/// integer vectors. This is similar to _mm256_set_m128i, but the order of
-/// the input parameters is swapped.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
-///
-/// \param __lo
-/// A 128-bit integer vector to be copied to the lower 128 bits of the
-/// result.
-/// \param __hi
-/// A 128-bit integer vector to be copied to the upper 128 bits of the
-/// result.
-/// \returns A 256-bit integer vector containing the concatenated result.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_setr_m128i (__m128i __lo, __m128i __hi)
-{
- return (__m256i)_mm256_set_m128i(__hi, __lo);
-}
-
-/* SIMD load ops (unaligned) */
-/// Loads two 128-bit floating-point vectors of [4 x float] from
-/// unaligned memory locations and constructs a 256-bit floating-point vector
-/// of [8 x float] by concatenating the two 128-bit vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to load instructions followed by the
-/// <c> VINSERTF128 </c> instruction.
-///
-/// \param __addr_hi
-/// A pointer to a 128-bit memory location containing 4 consecutive
-/// single-precision floating-point values. These values are to be copied to
-/// bits[255:128] of the result. The address of the memory location does not
-/// have to be aligned.
-/// \param __addr_lo
-/// A pointer to a 128-bit memory location containing 4 consecutive
-/// single-precision floating-point values. These values are to be copied to
-/// bits[127:0] of the result. The address of the memory location does not
-/// have to be aligned.
-/// \returns A 256-bit floating-point vector of [8 x float] containing the
-/// concatenated result.
-static __inline __m256 __DEFAULT_FN_ATTRS
-_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
-{
- return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
-}
-
-/// Loads two 128-bit floating-point vectors of [2 x double] from
-/// unaligned memory locations and constructs a 256-bit floating-point vector
-/// of [4 x double] by concatenating the two 128-bit vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to load instructions followed by the
-/// <c> VINSERTF128 </c> instruction.
-///
-/// \param __addr_hi
-/// A pointer to a 128-bit memory location containing two consecutive
-/// double-precision floating-point values. These values are to be copied to
-/// bits[255:128] of the result. The address of the memory location does not
-/// have to be aligned.
-/// \param __addr_lo
-/// A pointer to a 128-bit memory location containing two consecutive
-/// double-precision floating-point values. These values are to be copied to
-/// bits[127:0] of the result. The address of the memory location does not
-/// have to be aligned.
-/// \returns A 256-bit floating-point vector of [4 x double] containing the
-/// concatenated result.
-static __inline __m256d __DEFAULT_FN_ATTRS
-_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
-{
- return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
-}
-
-/// Loads two 128-bit integer vectors from unaligned memory locations and
-/// constructs a 256-bit integer vector by concatenating the two 128-bit
-/// vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to load instructions followed by the
-/// <c> VINSERTF128 </c> instruction.
-///
-/// \param __addr_hi
-/// A pointer to a 128-bit memory location containing a 128-bit integer
-/// vector. This vector is to be copied to bits[255:128] of the result. The
-/// address of the memory location does not have to be aligned.
-/// \param __addr_lo
-/// A pointer to a 128-bit memory location containing a 128-bit integer
-/// vector. This vector is to be copied to bits[127:0] of the result. The
-/// address of the memory location does not have to be aligned.
-/// \returns A 256-bit integer vector containing the concatenated result.
-static __inline __m256i __DEFAULT_FN_ATTRS
-_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo)
-{
- return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
-}
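-
-/* Example (illustrative sketch): gathering two independent, possibly
- * unaligned 128-bit rows into one 256-bit register.
- *
- *   float row0[4] = { 1, 2, 3, 4 }, row1[4] = { 5, 6, 7, 8 };
- *   __m256 both = _mm256_loadu2_m128(row1, row0); // row0 fills bits [127:0]
- */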
-
-/* SIMD store ops (unaligned) */
-/// Stores the upper and lower 128 bits of a 256-bit floating-point
-/// vector of [8 x float] into two different unaligned memory locations.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
-/// store instructions.
-///
-/// \param __addr_hi
-/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
-/// copied to this memory location. The address of this memory location does
-/// not have to be aligned.
-/// \param __addr_lo
-/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
-/// copied to this memory location. The address of this memory location does
-/// not have to be aligned.
-/// \param __a
-/// A 256-bit floating-point vector of [8 x float].
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
-{
- __m128 __v128;
-
- __v128 = _mm256_castps256_ps128(__a);
- _mm_storeu_ps(__addr_lo, __v128);
- __v128 = _mm256_extractf128_ps(__a, 1);
- _mm_storeu_ps(__addr_hi, __v128);
-}
-
-/// Stores the upper and lower 128 bits of a 256-bit floating-point
-/// vector of [4 x double] into two different unaligned memory locations.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
-/// store instructions.
-///
-/// \param __addr_hi
-/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
-/// copied to this memory location. The address of this memory location does
-/// not have to be aligned.
-/// \param __addr_lo
-/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
-/// copied to this memory location. The address of this memory location does
-/// not have to be aligned.
-/// \param __a
-/// A 256-bit floating-point vector of [4 x double].
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
-{
- __m128d __v128;
-
- __v128 = _mm256_castpd256_pd128(__a);
- _mm_storeu_pd(__addr_lo, __v128);
- __v128 = _mm256_extractf128_pd(__a, 1);
- _mm_storeu_pd(__addr_hi, __v128);
-}
-
-/// Stores the upper and lower 128 bits of a 256-bit integer vector into
-/// two different unaligned memory locations.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
-/// store instructions.
-///
-/// \param __addr_hi
-/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
-/// copied to this memory location. The address of this memory location does
-/// not have to be aligned.
-/// \param __addr_lo
-/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
-/// copied to this memory location. The address of this memory location does
-/// not have to be aligned.
-/// \param __a
-/// A 256-bit integer vector.
-static __inline void __DEFAULT_FN_ATTRS
-_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a)
-{
- __m128i __v128;
-
- __v128 = _mm256_castsi256_si128(__a);
- _mm_storeu_si128(__addr_lo, __v128);
- __v128 = _mm256_extractf128_si256(__a, 1);
- _mm_storeu_si128(__addr_hi, __v128);
-}
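-
-/* Example (illustrative sketch): scattering the two halves of a 256-bit
- * vector back to two unaligned destinations, mirroring the loadu2 helpers.
- *
- *   float lo[4], hi[4];
- *   __m256 v = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
- *   _mm256_storeu2_m128(hi, lo, v);               // lo = {0,1,2,3}, hi = {4,5,6,7}
- */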
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS128
-
-#endif /* __AVXINTRIN_H */
+++ /dev/null
-/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
- *
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __AVXVNNIINTRIN_H
-#define __AVXVNNIINTRIN_H
-
-/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */
-/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
-/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
-/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
-/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
-/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
-/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
-/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
-/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
-
-/* Intrinsics with _avx_ prefix are for compatibility with msvc. */
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
-/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a __S, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
-/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
-/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
-/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
-/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-/// ENDFOR
-/// DST[MAX:256] := 0
-/// \endoperation
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
-/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
-/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
-/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
-/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
-/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-/// ENDFOR
-/// DST[MAX:256] := 0
-/// \endoperation
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
-/// and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
-/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
-/// ENDFOR
-/// DST[MAX:256] := 0
-/// \endoperation
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
-/// using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 7
-/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
-/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
-/// ENDFOR
-/// DST[MAX:256] := 0
-/// \endoperation
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
-{
- return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
-/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a __S, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 3
-/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
-/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
-/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
-/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
-/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
-/// ENDFOR
-/// DST[MAX:128] := 0
-/// \endoperation
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
-}
-
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
-/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
-/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
-/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 3
-/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
-/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
-/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
-/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
-/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
-/// ENDFOR
-/// DST[MAX:128] := 0
-/// \endoperation
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
-/// and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 3
-/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
-/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
-/// ENDFOR
-/// DST[MAX:128] := 0
-/// \endoperation
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
-}
-
-/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
-/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
-/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
-/// using signed saturation, and store the packed 32-bit results in DST.
-///
-/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
-///
-/// \operation
-/// FOR j := 0 to 3
-/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
-/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
-/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
-/// ENDFOR
-/// DST[MAX:128] := 0
-/// \endoperation
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif // __AVXVNNIINTRIN_H
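/* Illustrative only -- not part of the patch. A minimal sketch (assuming an
 * AVX-VNNI-capable CPU and -mavxvnni) of the VPDPBUSD-based dot product removed
 * above: each 32-bit lane accumulates four unsigned-byte x signed-byte products. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256i a   = _mm256_set1_epi8(2);    /* treated as unsigned bytes */
    __m256i b   = _mm256_set1_epi8(-3);   /* treated as signed bytes */
    __m256i acc = _mm256_setzero_si256(); /* running 32-bit sums, all zero */

    /* Each dword becomes 0 + 4 * (2 * -3) = -24. */
    __m256i r = _mm256_dpbusd_avx_epi32(acc, a, b);

    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    printf("lane0 = %d\n", out[0]);       /* prints -24 */
    return 0;
}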
+++ /dev/null
-/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __BMI2INTRIN_H
-#define __BMI2INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2")))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_bzhi_u32(unsigned int __X, unsigned int __Y)
-{
- return __builtin_ia32_bzhi_si(__X, __Y);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_pdep_u32(unsigned int __X, unsigned int __Y)
-{
- return __builtin_ia32_pdep_si(__X, __Y);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_pext_u32(unsigned int __X, unsigned int __Y)
-{
- return __builtin_ia32_pext_si(__X, __Y);
-}
-
-#ifdef __x86_64__
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_bzhi_u64(unsigned long long __X, unsigned long long __Y)
-{
- return __builtin_ia32_bzhi_di(__X, __Y);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_pdep_u64(unsigned long long __X, unsigned long long __Y)
-{
- return __builtin_ia32_pdep_di(__X, __Y);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_pext_u64(unsigned long long __X, unsigned long long __Y)
-{
- return __builtin_ia32_pext_di(__X, __Y);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_mulx_u64 (unsigned long long __X, unsigned long long __Y,
- unsigned long long *__P)
-{
- unsigned __int128 __res = (unsigned __int128) __X * __Y;
- *__P = (unsigned long long) (__res >> 64);
- return (unsigned long long) __res;
-}
-
-#else /* !__x86_64__ */
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
-{
- unsigned long long __res = (unsigned long long) __X * __Y;
- *__P = (unsigned int) (__res >> 32);
- return (unsigned int) __res;
-}
-
-#endif /* !__x86_64__ */
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __BMI2INTRIN_H */
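/* Illustrative only -- not part of the patch. A minimal sketch (assuming a
 * BMI2-capable x86-64 CPU and -mbmi2) of the helpers removed above: _pext_u32
 * gathers the bits selected by a mask into the low bits, _pdep_u32 scatters
 * them back, and _mulx_u64 returns the full 128-bit product. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    unsigned int value = 0xF0F0F0F0u;
    unsigned int mask  = 0x0000FF00u;

    unsigned int packed    = _pext_u32(value, mask);  /* 0xF0: bits 8..15 of value */
    unsigned int scattered = _pdep_u32(packed, mask); /* 0xF000: placed back under mask */

    unsigned long long hi;
    unsigned long long lo = _mulx_u64(~0ull, 2, &hi); /* (2^64 - 1) * 2 split into hi:lo */

    printf("pext=%#x pdep=%#x hi=%llu lo=%#llx\n", packed, scattered, hi, lo);
    return 0;
}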
+++ /dev/null
-/*===---- bmiintrin.h - BMI intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __BMIINTRIN_H
-#define __BMIINTRIN_H
-
-/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT
- instruction behaves as BSF on non-BMI targets, there is code that expects
- to use it as a potentially faster version of BSF. */
-#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
-
-#define _tzcnt_u16(a) (__tzcnt_u16((a)))
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
-///
-/// \param __X
-/// An unsigned 16-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 16-bit integer containing the number of trailing zero
-/// bits in the operand.
-static __inline__ unsigned short __RELAXED_FN_ATTRS
-__tzcnt_u16(unsigned short __X)
-{
- return __builtin_ia32_tzcnt_u16(__X);
-}
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
-///
-/// \param __X
-/// An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 32-bit integer containing the number of trailing zero
-/// bits in the operand.
-static __inline__ unsigned int __RELAXED_FN_ATTRS
-__tzcnt_u32(unsigned int __X)
-{
- return __builtin_ia32_tzcnt_u32(__X);
-}
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
-///
-/// \param __X
-/// An unsigned 32-bit integer whose trailing zeros are to be counted.
-/// \returns A 32-bit integer containing the number of trailing zero bits in
-/// the operand.
-static __inline__ int __RELAXED_FN_ATTRS
-_mm_tzcnt_32(unsigned int __X)
-{
- return __builtin_ia32_tzcnt_u32(__X);
-}
-
-#define _tzcnt_u32(a) (__tzcnt_u32((a)))
-
-#ifdef __x86_64__
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
-///
-/// \param __X
-/// An unsigned 64-bit integer whose trailing zeros are to be counted.
-/// \returns An unsigned 64-bit integer containing the number of trailing zero
-/// bits in the operand.
-static __inline__ unsigned long long __RELAXED_FN_ATTRS
-__tzcnt_u64(unsigned long long __X)
-{
- return __builtin_ia32_tzcnt_u64(__X);
-}
-
-/// Counts the number of trailing zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> TZCNT </c> instruction.
-///
-/// \param __X
-/// An unsigned 64-bit integer whose trailing zeros are to be counted.
-/// \returns A 64-bit integer containing the number of trailing zero bits in
-/// the operand.
-static __inline__ long long __RELAXED_FN_ATTRS
-_mm_tzcnt_64(unsigned long long __X)
-{
- return __builtin_ia32_tzcnt_u64(__X);
-}
-
-#define _tzcnt_u64(a) (__tzcnt_u64((a)))
-
-#endif /* __x86_64__ */
-
-#undef __RELAXED_FN_ATTRS
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__BMI__)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi")))
-
-#define _andn_u32(a, b) (__andn_u32((a), (b)))
-
-/* _bextr_u32 != __bextr_u32 */
-#define _blsi_u32(a) (__blsi_u32((a)))
-
-#define _blsmsk_u32(a) (__blsmsk_u32((a)))
-
-#define _blsr_u32(a) (__blsr_u32((a)))
-
-/// Performs a bitwise AND of the second operand with the one's
-/// complement of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> ANDN </c> instruction.
-///
-/// \param __X
-/// An unsigned integer containing one of the operands.
-/// \param __Y
-/// An unsigned integer containing one of the operands.
-/// \returns An unsigned integer containing the bitwise AND of the second
-/// operand with the one's complement of the first operand.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__andn_u32(unsigned int __X, unsigned int __Y)
-{
- return ~__X & __Y;
-}
-
-/* AMD-specified, double-leading-underscore version of BEXTR */
-/// Extracts the specified bits from the first operand and returns them
-/// in the least significant bits of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
-///
-/// \param __X
-/// An unsigned integer whose bits are to be extracted.
-/// \param __Y
-/// An unsigned integer used to specify which bits are extracted. Bits [7:0]
-/// specify the index of the least significant bit. Bits [15:8] specify the
-/// number of bits to be extracted.
-/// \returns An unsigned integer whose least significant bits contain the
-/// extracted bits.
-/// \see _bextr_u32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__bextr_u32(unsigned int __X, unsigned int __Y)
-{
- return __builtin_ia32_bextr_u32(__X, __Y);
-}
-
-/* Intel-specified, single-leading-underscore version of BEXTR */
-/// Extracts the specified bits from the first operand and returns them
-/// in the least significant bits of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
-///
-/// \param __X
-/// An unsigned integer whose bits are to be extracted.
-/// \param __Y
-/// An unsigned integer used to specify the index of the least significant
-/// bit for the bits to be extracted. Bits [7:0] specify the index.
-/// \param __Z
-/// An unsigned integer used to specify the number of bits to be extracted.
-/// Bits [7:0] specify the number of bits.
-/// \returns An unsigned integer whose least significant bits contain the
-/// extracted bits.
-/// \see __bextr_u32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z)
-{
- return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
-}
-
-/* Intel-specified, single-leading-underscore version of BEXTR2 */
-/// Extracts the specified bits from the first operand and returns them
-/// in the least significant bits of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
-///
-/// \param __X
-/// An unsigned integer whose bits are to be extracted.
-/// \param __Y
-/// An unsigned integer used to specify which bits are extracted. Bits [7:0]
-/// specify the index of the least significant bit. Bits [15:8] specify the
-/// number of bits to be extracted.
-/// \returns An unsigned integer whose least significant bits contain the
-/// extracted bits.
-/// \see __bextr_u32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_bextr2_u32(unsigned int __X, unsigned int __Y) {
- return __builtin_ia32_bextr_u32(__X, __Y);
-}
-
-/// Clears all bits in the source except for the least significant bit
-/// containing a value of 1 and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> BLSI </c> instruction.
-///
-/// \param __X
-/// An unsigned integer whose bits are to be cleared.
-/// \returns An unsigned integer containing the result of clearing the bits from
-/// the source operand.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsi_u32(unsigned int __X)
-{
- return __X & -__X;
-}
-
-/// Creates a mask whose bits are set to 1, using bit 0 up to and
-/// including the least significant bit that is set to 1 in the source
-/// operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
-///
-/// \param __X
-/// An unsigned integer used to create the mask.
-/// \returns An unsigned integer containing the newly created mask.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsmsk_u32(unsigned int __X)
-{
- return __X ^ (__X - 1);
-}
-
-/// Clears the least significant bit that is set to 1 in the source
-/// operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> BLSR </c> instruction.
-///
-/// \param __X
-/// An unsigned integer containing the operand to be cleared.
-/// \returns An unsigned integer containing the result of clearing the source
-/// operand.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsr_u32(unsigned int __X)
-{
- return __X & (__X - 1);
-}
-
-#ifdef __x86_64__
-
-#define _andn_u64(a, b) (__andn_u64((a), (b)))
-
-/* _bextr_u64 != __bextr_u64 */
-#define _blsi_u64(a) (__blsi_u64((a)))
-
-#define _blsmsk_u64(a) (__blsmsk_u64((a)))
-
-#define _blsr_u64(a) (__blsr_u64((a)))
-
-/// Performs a bitwise AND of the second operand with the one's
-/// complement of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> ANDN </c> instruction.
-///
-/// \param __X
-/// An unsigned 64-bit integer containing one of the operands.
-/// \param __Y
-/// An unsigned 64-bit integer containing one of the operands.
-/// \returns An unsigned 64-bit integer containing the bitwise AND of the second
-/// operand with the one's complement of the first operand.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__andn_u64 (unsigned long long __X, unsigned long long __Y)
-{
- return ~__X & __Y;
-}
-
-/* AMD-specified, double-leading-underscore version of BEXTR */
-/// Extracts the specified bits from the first operand and returns them
-/// in the least significant bits of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
-///
-/// \param __X
-/// An unsigned 64-bit integer whose bits are to be extracted.
-/// \param __Y
-/// An unsigned 64-bit integer used to specify which bits are extracted. Bits
-/// [7:0] specify the index of the least significant bit. Bits [15:8] specify
-/// the number of bits to be extracted.
-/// \returns An unsigned 64-bit integer whose least significant bits contain the
-/// extracted bits.
-/// \see _bextr_u64
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__bextr_u64(unsigned long long __X, unsigned long long __Y)
-{
- return __builtin_ia32_bextr_u64(__X, __Y);
-}
-
-/* Intel-specified, single-leading-underscore version of BEXTR */
-/// Extracts the specified bits from the first operand and returns them
-/// in the least significant bits of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
-///
-/// \param __X
-/// An unsigned 64-bit integer whose bits are to be extracted.
-/// \param __Y
-/// An unsigned integer used to specify the index of the least significant
-/// bit for the bits to be extracted. Bits [7:0] specify the index.
-/// \param __Z
-/// An unsigned integer used to specify the number of bits to be extracted.
-/// Bits [7:0] specify the number of bits.
-/// \returns An unsigned 64-bit integer whose least significant bits contain the
-/// extracted bits.
-/// \see __bextr_u64
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z)
-{
- return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
-}
-
-/* Intel-specified, single-leading-underscore version of BEXTR2 */
-/// Extracts the specified bits from the first operand and returns them
-/// in the least significant bits of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> BEXTR </c> instruction.
-///
-/// \param __X
-/// An unsigned 64-bit integer whose bits are to be extracted.
-/// \param __Y
-/// An unsigned 64-bit integer used to specify which bits are extracted. Bits
-/// [7:0] specify the index of the least significant bit. Bits [15:8] specify
-/// the number of bits to be extracted.
-/// \returns An unsigned 64-bit integer whose least significant bits contain the
-/// extracted bits.
-/// \see __bextr_u64
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_bextr2_u64(unsigned long long __X, unsigned long long __Y) {
- return __builtin_ia32_bextr_u64(__X, __Y);
-}
-
-/// Clears all bits in the source except for the least significant bit
-/// containing a value of 1 and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> BLSI </c> instruction.
-///
-/// \param __X
-/// An unsigned 64-bit integer whose bits are to be cleared.
-/// \returns An unsigned 64-bit integer containing the result of clearing the
-/// bits from the source operand.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsi_u64(unsigned long long __X)
-{
- return __X & -__X;
-}
-
-/// Creates a mask whose bits are set to 1, using bit 0 up to and
-/// including the least significant bit that is set to 1 in the source
-/// operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> BLSMSK </c> instruction.
-///
-/// \param __X
-/// An unsigned 64-bit integer used to create the mask.
-/// \returns An unsigned 64-bit integer containing the newly created mask.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsmsk_u64(unsigned long long __X)
-{
- return __X ^ (__X - 1);
-}
-
-/// Clears the least significant bit that is set to 1 in the source
-/// operand and returns the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> BLSR </c> instruction.
-///
-/// \param __X
-/// An unsigned 64-bit integer containing the operand to be cleared.
-/// \returns An unsigned 64-bit integer containing the result of clearing the
-/// source operand.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsr_u64(unsigned long long __X)
-{
- return __X & (__X - 1);
-}
-
-#endif /* __x86_64__ */
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
- || defined(__BMI__) */
-
-#endif /* __BMIINTRIN_H */
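/* Illustrative only -- not part of the patch. A minimal sketch (assuming -mbmi)
 * of the helpers removed above: TZCNT counts trailing zeros, BEXTR extracts a
 * bit field, and BLSR clears the lowest set bit. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    unsigned int x = 0x00A0u;                 /* binary 1010 0000 */

    unsigned int tz    = __tzcnt_u32(x);      /* 5: lowest set bit is bit 5 */
    unsigned int field = _bextr_u32(x, 5, 3); /* 3 bits starting at bit 5 -> 0b101 = 5 */
    unsigned int rest  = __blsr_u32(x);       /* clear bit 5 -> 0x80 */

    printf("tzcnt=%u bextr=%u blsr=%#x\n", tz, field, rest);
    return 0;
}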
+++ /dev/null
-/*===---- cetintrin.h - CET intrinsic --------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <cetintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __CETINTRIN_H
-#define __CETINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("shstk")))
-
-static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) {
- __builtin_ia32_incsspd(__a);
-}
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) {
- __builtin_ia32_incsspq(__a);
-}
-#endif /* __x86_64__ */
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
- __builtin_ia32_incsspq(__a);
-}
-#else /* __x86_64__ */
-static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) {
- __builtin_ia32_incsspd((int)__a);
-}
-#endif /* __x86_64__ */
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) {
- return __builtin_ia32_rdsspd(__a);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32() {
- unsigned int t;
- return __builtin_ia32_rdsspd(t);
-}
-
-#ifdef __x86_64__
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) {
- return __builtin_ia32_rdsspq(__a);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64() {
- unsigned long long t;
- return __builtin_ia32_rdsspq(t);
-}
-#endif /* __x86_64__ */
-
-#ifdef __x86_64__
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS _get_ssp(void) {
- return __builtin_ia32_rdsspq(0);
-}
-#else /* __x86_64__ */
-static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) {
- return __builtin_ia32_rdsspd(0);
-}
-#endif /* __x86_64__ */
-
-static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() {
- __builtin_ia32_saveprevssp();
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) {
- __builtin_ia32_rstorssp(__p);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) {
- __builtin_ia32_wrssd(__a, __p);
-}
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) {
- __builtin_ia32_wrssq(__a, __p);
-}
-#endif /* __x86_64__ */
-
-static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) {
- __builtin_ia32_wrussd(__a, __p);
-}
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) {
- __builtin_ia32_wrussq(__a, __p);
-}
-#endif /* __x86_64__ */
-
-static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() {
- __builtin_ia32_setssbsy();
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) {
- __builtin_ia32_clrssbsy(__p);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __CETINTRIN_H */
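/* Illustrative only -- not part of the patch. A minimal sketch (assuming x86-64
 * and -mshstk) of reading the shadow stack pointer via the helper removed
 * above. RDSSP reuses a former NOP encoding, so when CET shadow stacks are not
 * enabled the instruction is a no-op and _get_ssp() returns 0. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    unsigned long long ssp = _get_ssp();  /* shadow stack pointer, or 0 if CET is off */
    printf("ssp = %#llx\n", ssp);
    return 0;
}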
+++ /dev/null
-/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <cldemoteintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __CLDEMOTEINTRIN_H
-#define __CLDEMOTEINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("cldemote")))
-
-/// Hint to hardware that the cache line that contains \p __P should be demoted
-/// from the cache closest to the processor core to a level more distant from
-/// the processor core.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CLDEMOTE </c> instruction.
-static __inline__ void __DEFAULT_FN_ATTRS
-_cldemote(const void * __P) {
- __builtin_ia32_cldemote(__P);
-}
-
-#define _mm_cldemote(p) _cldemote(p)
-#undef __DEFAULT_FN_ATTRS
-
-#endif
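/* Illustrative only -- not part of the patch. A minimal sketch (assuming
 * -mcldemote) of the cache-line demotion hint removed above. The hint is
 * advisory and reuses a former NOP encoding, so it is harmless on CPUs without
 * CLDEMOTE support. */
#include <immintrin.h>

int main(void)
{
    static int buffer[1024];

    buffer[0] = 42;          /* touch the line so it is resident in a near cache */
    _cldemote(&buffer[0]);   /* hint: move this line to a more distant cache level */
    return 0;
}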
+++ /dev/null
-/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <clflushoptintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __CLFLUSHOPTINTRIN_H
-#define __CLFLUSHOPTINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt")))
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_clflushopt(void const * __m) {
- __builtin_ia32_clflushopt(__m);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
+++ /dev/null
-/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <clwbintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __CLWBINTRIN_H
-#define __CLWBINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb")))
-
-/// Writes back to memory the cache line (if modified) that contains the
-/// linear address specified in \a __p from any level of the cache hierarchy in
-/// the cache coherence domain.
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> CLWB </c> instruction.
-///
-/// \param __p
-/// A pointer to the memory location used to identify the cache line to be
-/// written back.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_clwb(void const *__p) {
- __builtin_ia32_clwb(__p);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
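/* Illustrative only -- not part of the patch. A minimal sketch (assuming a
 * CLWB-capable CPU and -mclwb) of the write-back flush removed above, in the
 * style of persistent-memory code: CLWB writes a dirty line back toward memory
 * without necessarily evicting it, and SFENCE orders the flush. */
#include <immintrin.h>

int main(void)
{
    static long long record[8];

    record[0] = 123;        /* dirty the cache line */
    _mm_clwb(&record[0]);   /* write the line back toward memory */
    _mm_sfence();           /* order the write-back with later stores */
    return 0;
}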
+++ /dev/null
-/*===----------------------- clzerointrin.h - CLZERO ----------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <clzerointrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __CLZEROINTRIN_H
-#define __CLZEROINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("clzero")))
-
-/// Zeroes out the cache line that contains the address \a __line.
-///
-/// \headerfile <clzerointrin.h>
-///
-/// This intrinsic corresponds to the <c> CLZERO </c> instruction.
-///
-/// \param __line
-/// A pointer to the cache line that needs to be zeroed out.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_clzero (void * __line)
-{
- __builtin_ia32_clzero ((void *)__line);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __CLZEROINTRIN_H */
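/* Illustrative only -- not part of the patch. A minimal sketch (assuming an AMD
 * CPU with CLZERO and -mclzero) of the helper removed above. The entire cache
 * line containing the pointer is zeroed, so the buffer should be cache-line
 * aligned to avoid clobbering neighbouring data. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    static int line[16] __attribute__((aligned(64))) = {1, 2, 3, 4};  /* one 64-byte line */

    _mm_clzero(line);  /* zero the whole 64-byte cache line */
    printf("%d %d %d %d\n", line[0], line[1], line[2], line[3]);      /* 0 0 0 0 */
    return 0;
}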
+++ /dev/null
-/*===---- crc32intrin.h - SSE4.2 Accumulate CRC32 intrinsics ---------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __CRC32INTRIN_H
-#define __CRC32INTRIN_H
-
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("crc32")))
-
-/// Adds the unsigned integer operand to the CRC-32C checksum of the
-/// unsigned char operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CRC32B </c> instruction.
-///
-/// \param __C
-/// An unsigned integer operand to add to the CRC-32C checksum of operand
-/// \a __D.
-/// \param __D
-/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
-/// \returns The result of adding operand \a __C to the CRC-32C checksum of
-/// operand \a __D.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_crc32_u8(unsigned int __C, unsigned char __D)
-{
- return __builtin_ia32_crc32qi(__C, __D);
-}
-
-/// Adds the unsigned integer operand to the CRC-32C checksum of the
-/// unsigned short operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CRC32W </c> instruction.
-///
-/// \param __C
-/// An unsigned integer operand to add to the CRC-32C checksum of operand
-/// \a __D.
-/// \param __D
-/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
-/// \returns The result of adding operand \a __C to the CRC-32C checksum of
-/// operand \a __D.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_crc32_u16(unsigned int __C, unsigned short __D)
-{
- return __builtin_ia32_crc32hi(__C, __D);
-}
-
-/// Adds the first unsigned integer operand to the CRC-32C checksum of
-/// the second unsigned integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CRC32L </c> instruction.
-///
-/// \param __C
-/// An unsigned integer operand to add to the CRC-32C checksum of operand
-/// \a __D.
-/// \param __D
-/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
-/// \returns The result of adding operand \a __C to the CRC-32C checksum of
-/// operand \a __D.
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_crc32_u32(unsigned int __C, unsigned int __D)
-{
- return __builtin_ia32_crc32si(__C, __D);
-}
-
-#ifdef __x86_64__
-/// Adds the unsigned integer operand to the CRC-32C checksum of the
-/// unsigned 64-bit integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CRC32Q </c> instruction.
-///
-/// \param __C
-/// An unsigned integer operand to add to the CRC-32C checksum of operand
-/// \a __D.
-/// \param __D
-/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
-/// \returns The result of adding operand \a __C to the CRC-32C checksum of
-/// operand \a __D.
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_mm_crc32_u64(unsigned long long __C, unsigned long long __D)
-{
- return __builtin_ia32_crc32di(__C, __D);
-}
-#endif /* __x86_64__ */
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __CRC32INTRIN_H */
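/* Illustrative only -- not part of the patch. A minimal sketch (assuming
 * -mcrc32 or -msse4.2) of accumulating a CRC-32C checksum byte by byte with the
 * helper removed above, using the conventional initial value and final
 * inversion. */
#include <immintrin.h>
#include <stddef.h>
#include <stdio.h>

static unsigned int crc32c(const unsigned char *buf, size_t len)
{
    unsigned int crc = 0xFFFFFFFFu;         /* conventional initial value */
    for (size_t i = 0; i < len; i++)
        crc = _mm_crc32_u8(crc, buf[i]);    /* fold one byte into the checksum */
    return crc ^ 0xFFFFFFFFu;               /* conventional final inversion */
}

int main(void)
{
    const unsigned char msg[] = "123456789";
    printf("crc32c = %#x\n", crc32c(msg, sizeof(msg) - 1));  /* 0xe3069283 */
    return 0;
}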
+++ /dev/null
-/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __EMMINTRIN_H
-#define __EMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include <xmmintrin.h>
-
-typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
-typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
-
-typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
-typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));
-
-/* Type defines. */
-typedef double __v2df __attribute__ ((__vector_size__ (16)));
-typedef long long __v2di __attribute__ ((__vector_size__ (16)));
-typedef short __v8hi __attribute__((__vector_size__(16)));
-typedef char __v16qi __attribute__((__vector_size__(16)));
-
-/* Unsigned types */
-typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
-typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
-typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
-
-/* We need an explicitly signed variant for char. Note that this shouldn't
- * appear in the interface though. */
-typedef signed char __v16qs __attribute__((__vector_size__(16)));
-
-#if (__clang_major__ > 15)
-#ifdef __SSE2__
-/* Both _Float16 and __bf16 require SSE2 to be enabled. */
-typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
-typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
-typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
-
-typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
-typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
-#endif
-#endif
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64)))
-
-/// Adds lower double-precision values in both operands and returns the
-/// sum in the lower 64 bits of the result. The upper 64 bits of the result
-/// are copied from the upper double-precision value of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
-/// from the upper 64 bits of the first source operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_add_sd(__m128d __a, __m128d __b)
-{
- __a[0] += __b[0];
- return __a;
-}
-
-/// Adds two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// \returns A 128-bit vector of [2 x double] containing the sums of both
-/// operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_add_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)((__v2df)__a + (__v2df)__b);
-}
-
-/// Subtracts the lower double-precision value of the second operand
-/// from the lower double-precision value of the first operand and returns
-/// the difference in the lower 64 bits of the result. The upper 64 bits of
-/// the result are copied from the upper double-precision value of the first
-/// operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the minuend.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing the subtrahend.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-/// difference of the lower 64 bits of both operands. The upper 64 bits are
-/// copied from the upper 64 bits of the first source operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_sub_sd(__m128d __a, __m128d __b)
-{
- __a[0] -= __b[0];
- return __a;
-}
-
-/// Subtracts two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the minuend.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing the subtrahend.
-/// \returns A 128-bit vector of [2 x double] containing the differences between
-/// both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_sub_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)((__v2df)__a - (__v2df)__b);
-}
-
-/// Multiplies lower double-precision values in both operands and returns
-/// the product in the lower 64 bits of the result. The upper 64 bits of the
-/// result are copied from the upper double-precision value of the first
-/// operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-/// product of the lower 64 bits of both operands. The upper 64 bits are
-/// copied from the upper 64 bits of the first source operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mul_sd(__m128d __a, __m128d __b)
-{
- __a[0] *= __b[0];
- return __a;
-}
-
-/// Multiplies two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the operands.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the operands.
-/// \returns A 128-bit vector of [2 x double] containing the products of both
-/// operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_mul_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)((__v2df)__a * (__v2df)__b);
-}
-
-/// Divides the lower double-precision value of the first operand by the
-/// lower double-precision value of the second operand and returns the
-/// quotient in the lower 64 bits of the result. The upper 64 bits of the
-/// result are copied from the upper double-precision value of the first
-/// operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the dividend.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing the divisor.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-/// quotient of the lower 64 bits of both operands. The upper 64 bits are
-/// copied from the upper 64 bits of the first source operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_div_sd(__m128d __a, __m128d __b)
-{
- __a[0] /= __b[0];
- return __a;
-}
-
-/// Performs an element-by-element division of two 128-bit vectors of
-/// [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the dividend.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing the divisor.
-/// \returns A 128-bit vector of [2 x double] containing the quotients of both
-/// operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_div_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)((__v2df)__a / (__v2df)__b);
-}
-
-/// Calculates the square root of the lower double-precision value of
-/// the second operand and returns it in the lower 64 bits of the result.
-/// The upper 64 bits of the result are copied from the upper
-/// double-precision value of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the operands. The
-/// upper 64 bits of this operand are copied to the upper 64 bits of the
-/// result.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the operands. The
-/// square root is calculated using the lower 64 bits of this operand.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-/// square root of the lower 64 bits of operand \a __b, and whose upper 64
-/// bits are copied from the upper 64 bits of operand \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_sqrt_sd(__m128d __a, __m128d __b)
-{
- __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
- return __extension__ (__m128d) { __c[0], __a[1] };
-}
-
-/// Calculates the square root of each of the two values stored in a
-/// 128-bit vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector of [2 x double] containing the square roots of the
-/// values in the operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_sqrt_pd(__m128d __a)
-{
- return __builtin_ia32_sqrtpd((__v2df)__a);
-}
-
-/// Compares lower 64-bit double-precision values of both operands, and
-/// returns the lesser of the pair of values in the lower 64-bits of the
-/// result. The upper 64 bits of the result are copied from the upper
-/// double-precision value of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the operands. The
-/// lower 64 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the operands. The
-/// lower 64 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-/// minimum value between both operands. The upper 64 bits are copied from
-/// the upper 64 bits of the first source operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_min_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Performs element-by-element comparison of the two 128-bit vectors of
-/// [2 x double] and returns the vector containing the lesser of each pair of
-/// values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the operands.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the operands.
-/// \returns A 128-bit vector of [2 x double] containing the minimum values
-/// between both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_min_pd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares lower 64-bit double-precision values of both operands, and
-/// returns the greater of the pair of values in the lower 64-bits of the
-/// result. The upper 64 bits of the result are copied from the upper
-/// double-precision value of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the operands. The
-/// lower 64 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the operands. The
-/// lower 64 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-/// maximum value between both operands. The upper 64 bits are copied from
-/// the upper 64 bits of the first source operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_max_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Performs element-by-element comparison of the two 128-bit vectors of
-/// [2 x double] and returns the vector containing the greater of each pair
-/// of values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the operands.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the operands.
-/// \returns A 128-bit vector of [2 x double] containing the maximum values
-/// between both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_max_pd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
-/// values between both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_and_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)((__v2du)__a & (__v2du)__b);
-}
-
-/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
-/// the one's complement of the values contained in the first source operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the left source operand. The
-/// one's complement of this value is used in the bitwise AND.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing the right source operand.
-/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
-/// values in the second operand and the one's complement of the first
-/// operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_andnot_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)(~(__v2du)__a & (__v2du)__b);
-}
-
-/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
-/// values between both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_or_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)((__v2du)__a | (__v2du)__b);
-}
-
-/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
-/// values between both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_xor_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)((__v2du)__a ^ (__v2du)__b);
-}
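/* Illustrative only -- not part of the patch. A minimal sketch (assuming SSE2,
 * which is on by default for x86-64) of a classic use of the bitwise helpers
 * above: clearing the sign bits of a [2 x double] vector with ANDNOT to obtain
 * absolute values. */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128d x        = _mm_set_pd(-2.5, 3.0);      /* elements: {3.0, -2.5} */
    __m128d signmask = _mm_set1_pd(-0.0);          /* only the sign bits set */
    __m128d ax       = _mm_andnot_pd(signmask, x); /* ~signmask & x clears sign bits */

    double out[2];
    _mm_storeu_pd(out, ax);
    printf("%g %g\n", out[0], out[1]);             /* 3 2.5 */
    return 0;
}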
-
-/// Compares each of the corresponding double-precision values of the
-/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
-/// for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpeq_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-/// 128-bit vectors of [2 x double] to determine if the values in the first
-/// operand are less than those in the second operand. Each comparison
-/// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmplt_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-/// 128-bit vectors of [2 x double] to determine if the values in the first
-/// operand are less than or equal to those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmple_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-/// 128-bit vectors of [2 x double] to determine if the values in the first
-/// operand are greater than those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpgt_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
-}
-
-/// Compares each of the corresponding double-precision values of the
-/// 128-bit vectors of [2 x double] to determine if the values in the first
-/// operand are greater than or equal to those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpge_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
-}
-
-/// Compares each of the corresponding double-precision values of the
-/// 128-bit vectors of [2 x double] to determine if the values in the first
-/// operand are ordered with respect to those in the second operand.
-///
-/// A pair of double-precision values are "ordered" with respect to each
-/// other if neither value is a NaN. Each comparison yields 0x0 for false,
-/// 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpord_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-/// 128-bit vectors of [2 x double] to determine if the values in the first
-/// operand are unordered with respect to those in the second operand.
-///
-/// A pair of double-precision values are "unordered" with respect to each
-/// other if one or both values are NaN. Each comparison yields 0x0 for
-/// false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpunord_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
-}
-
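-/* Illustrative sketch, not part of this header: comparing a vector against
-   itself with _mm_cmpord_pd yields all-ones exactly in the lanes that are
-   not NaN, so ANDing with that mask zeroes NaN lanes.  The helper name is
-   hypothetical. */
-#include <emmintrin.h>
-
-static inline __m128d example_zero_nans_pd(__m128d __v)
-{
-  __m128d __ordered = _mm_cmpord_pd(__v, __v); /* all-ones where __v is not NaN */
-  return _mm_and_pd(__v, __ordered);           /* NaN lanes become +0.0         */
-}
-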
-/// Compares each of the corresponding double-precision values of the
-/// 128-bit vectors of [2 x double] to determine if the values in the first
-/// operand are unequal to those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpneq_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-/// 128-bit vectors of [2 x double] to determine if the values in the first
-/// operand are not less than those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpnlt_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-/// 128-bit vectors of [2 x double] to determine if the values in the first
-/// operand are not less than or equal to those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpnle_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares each of the corresponding double-precision values of the
-/// 128-bit vectors of [2 x double] to determine if the values in the first
-/// operand are not greater than those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpngt_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
-}
-
-/// Compares each of the corresponding double-precision values of the
-/// 128-bit vectors of [2 x double] to determine if the values in the first
-/// operand are not greater than or equal to those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \param __b
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector containing the comparison results.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpnge_pd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] for equality.
-///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contain the comparison
-/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpeq_sd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is less than the corresponding value in
-/// the second parameter.
-///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contain the comparison
-/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmplt_sd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is less than or equal to the
-/// corresponding value in the second parameter.
-///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contain the comparison
-/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmple_sd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is greater than the corresponding value
-/// in the second parameter.
-///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contain the comparison
-/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpgt_sd(__m128d __a, __m128d __b)
-{
- __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
- return __extension__ (__m128d) { __c[0], __a[1] };
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is greater than or equal to the
-/// corresponding value in the second parameter.
-///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contain the comparison
-/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpge_sd(__m128d __a, __m128d __b)
-{
- __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
- return __extension__ (__m128d) { __c[0], __a[1] };
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is "ordered" with respect to the
-/// corresponding value in the second parameter.
-///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
-/// of double-precision values are "ordered" with respect to each other if
-/// neither value is a NaN.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contain the comparison
-/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpord_sd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is "unordered" with respect to the
-/// corresponding value in the second parameter.
-///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
-/// of double-precision values are "unordered" with respect to each other if
-/// one or both values are NaN.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contain the comparison
-/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpunord_sd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is unequal to the corresponding value in
-/// the second parameter.
-///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contain the comparison
-/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpneq_sd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is not less than the corresponding
-/// value in the second parameter.
-///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contain the comparison
-/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpnlt_sd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is not less than or equal to the
-/// corresponding value in the second parameter.
-///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contain the comparison
-/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpnle_sd(__m128d __a, __m128d __b)
-{
- return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is not greater than the corresponding
-/// value in the second parameter.
-///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contain the comparison
-/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpngt_sd(__m128d __a, __m128d __b)
-{
- __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
- return __extension__ (__m128d) { __c[0], __a[1] };
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is not greater than or equal to the
-/// corresponding value in the second parameter.
-///
-/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns A 128-bit vector. The lower 64 bits contain the comparison
-/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cmpnge_sd(__m128d __a, __m128d __b)
-{
- __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
- return __extension__ (__m128d) { __c[0], __a[1] };
-}
-
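-/* Illustrative sketch, not part of this header: the *_sd comparisons above
-   produce their mask only in the low lane and pass the high lane of __a
-   through.  One hypothetical way to turn that mask into a C boolean is to
-   read the sign bits with _mm_movemask_pd and test bit 0. */
-#include <emmintrin.h>
-
-static inline int example_lt_sd(__m128d __a, __m128d __b)
-{
-  __m128d __m = _mm_cmplt_sd(__a, __b); /* low lane: all-ones or all-zeros */
-  return _mm_movemask_pd(__m) & 1;      /* 1 if __a[0] < __b[0], else 0    */
-}
-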
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] for equality.
-///
-/// The comparison yields 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comieq_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is less than the corresponding value in
-/// the second parameter.
-///
-/// The comparison yields 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comilt_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is less than or equal to the
-/// corresponding value in the second parameter.
-///
-/// The comparison yields 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comile_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is greater than the corresponding value
-/// in the second parameter.
-///
-/// The comparison yields 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comigt_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is greater than or equal to the
-/// corresponding value in the second parameter.
-///
-/// The comparison yields 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comige_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is unequal to the corresponding value in
-/// the second parameter.
-///
-/// The comparison yields 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, 1 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower double-precision values is NaN, 1 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comineq_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] for equality. The
-/// comparison yields 0 for false, 1 for true.
-///
-/// If either of the two lower double-precision values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomieq_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is less than the corresponding value in
-/// the second parameter.
-///
-/// The comparison yields 0 for false, 1 for true. If either of the two lower
-/// double-precision values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomilt_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is less than or equal to the
-/// corresponding value in the second parameter.
-///
-/// The comparison yields 0 for false, 1 for true. If either of the two lower
-/// double-precision values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomile_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is greater than the corresponding value
-/// in the second parameter.
-///
-/// The comparison yields 0 for false, 1 for true. If either of the two lower
-/// double-precision values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomigt_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is greater than or equal to the
-/// corresponding value in the second parameter.
-///
-/// The comparison yields 0 for false, 1 for true. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower double-precision values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomige_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
-}
-
-/// Compares the lower double-precision floating-point values in each of
-/// the two 128-bit floating-point vectors of [2 x double] to determine if
-/// the value in the first parameter is unequal to the corresponding value in
-/// the second parameter.
-///
-/// The comparison yields 0 for false, 1 for true. If either of the two lower
-/// double-precision values is NaN, 1 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __b.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision value is
-/// compared to the lower double-precision value of \a __a.
-/// \returns An integer containing the comparison result. If either of the two
-/// lower double-precision values is NaN, 1 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomineq_sd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
-}
-
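-/* Illustrative sketch, not part of this header: because the COMI/UCOMI
-   intrinsics return ordinary integers, they slot directly into scalar
-   control flow.  This hypothetical helper raises the low lane of __v to at
-   least __lo; a NaN low lane is left unchanged because the compare then
-   returns 0. */
-#include <emmintrin.h>
-
-static inline __m128d example_clamp_low_sd(__m128d __v, double __lo)
-{
-  if (_mm_comilt_sd(__v, _mm_set_sd(__lo)))   /* __v[0] < __lo ?           */
-    __v = _mm_move_sd(__v, _mm_set_sd(__lo)); /* replace low lane by __lo  */
-  return __v;
-}
-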
-/// Converts the two double-precision floating-point elements of a
-/// 128-bit vector of [2 x double] into two single-precision floating-point
-/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
-/// The upper 64 bits of the result vector are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
-/// converted values. The upper 64 bits are set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cvtpd_ps(__m128d __a)
-{
- return __builtin_ia32_cvtpd2ps((__v2df)__a);
-}
-
-/// Converts the lower two single-precision floating-point elements of a
-/// 128-bit vector of [4 x float] into two double-precision floating-point
-/// values, returned in a 128-bit vector of [2 x double]. The upper two
-/// elements of the input vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower two single-precision
-/// floating-point elements are converted to double-precision values. The
-/// upper two elements are unused.
-/// \returns A 128-bit vector of [2 x double] containing the converted values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cvtps_pd(__m128 __a)
-{
- return (__m128d) __builtin_convertvector(
- __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
-}
-
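-/* Illustrative sketch, not part of this header: _mm_cvtpd_ps followed by
-   _mm_cvtps_pd round-trips two doubles through single precision, a
-   hypothetical (and lossy) way to narrow [2 x double] data to 64 bits and
-   widen it again. */
-#include <emmintrin.h>
-
-static inline __m128d example_narrow_widen_pd(__m128d __v)
-{
-  __m128 __f = _mm_cvtpd_ps(__v); /* two floats in the low 64 bits, rest 0 */
-  return _mm_cvtps_pd(__f);       /* back to double; precision may be lost */
-}
-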
-/// Converts the lower two integer elements of a 128-bit vector of
-/// [4 x i32] into two double-precision floating-point values, returned in a
-/// 128-bit vector of [2 x double].
-///
-/// The upper two elements of the input vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
-/// converted to double-precision values. The upper two elements are unused.
-/// \returns A 128-bit vector of [2 x double] containing the converted values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cvtepi32_pd(__m128i __a)
-{
- return (__m128d) __builtin_convertvector(
- __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
-}
-
-/// Converts the two double-precision floating-point elements of a
-/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
-/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
-/// 64 bits of the result vector are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
-/// converted values. The upper 64 bits are set to zero.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtpd_epi32(__m128d __a)
-{
- return __builtin_ia32_cvtpd2dq((__v2df)__a);
-}
-
-/// Converts the low-order element of a 128-bit vector of [2 x double]
-/// into a 32-bit signed integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
-/// conversion.
-/// \returns A 32-bit signed integer containing the converted value.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvtsd_si32(__m128d __a)
-{
- return __builtin_ia32_cvtsd2si((__v2df)__a);
-}
-
-/// Converts the lower double-precision floating-point element of a
-/// 128-bit vector of [2 x double], in the second parameter, into a
-/// single-precision floating-point value, returned in the lower 32 bits of a
-/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
-/// copied from the upper 96 bits of the first parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
-/// copied to the upper 96 bits of the result.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower double-precision
-/// floating-point element is used in the conversion.
-/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
-/// converted value from the second parameter. The upper 96 bits are copied
-/// from the upper 96 bits of the first parameter.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cvtsd_ss(__m128 __a, __m128d __b)
-{
- return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
-}
-
-/// Converts a 32-bit signed integer value, in the second parameter, into
-/// a double-precision floating-point value, returned in the lower 64 bits of
-/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
-/// are copied from the upper 64 bits of the first parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
-/// copied to the upper 64 bits of the result.
-/// \param __b
-/// A 32-bit signed integer containing the value to be converted.
-/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
-/// converted value from the second parameter. The upper 64 bits are copied
-/// from the upper 64 bits of the first parameter.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cvtsi32_sd(__m128d __a, int __b)
-{
- __a[0] = __b;
- return __a;
-}
-
-/// Converts the lower single-precision floating-point element of a
-/// 128-bit vector of [4 x float], in the second parameter, into a
-/// double-precision floating-point value, returned in the lower 64 bits of
-/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
-/// are copied from the upper 64 bits of the first parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
-/// copied to the upper 64 bits of the result.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower single-precision
-/// floating-point element is used in the conversion.
-/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
-/// converted value from the second parameter. The upper 64 bits are copied
-/// from the upper 64 bits of the first parameter.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cvtss_sd(__m128d __a, __m128 __b)
-{
- __a[0] = __b[0];
- return __a;
-}
-
-/// Converts the two double-precision floating-point elements of a
-/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
-/// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
-///
-/// If the result of either conversion is inexact, the result is truncated
-/// (rounded towards zero) regardless of the current MXCSR setting. The upper
-/// 64 bits of the result vector are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
-/// converted values. The upper 64 bits are set to zero.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvttpd_epi32(__m128d __a)
-{
- return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
-}
-
-/// Converts the low-order element of a [2 x double] vector into a 32-bit
-/// signed integer value, truncating the result when it is inexact.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
-/// conversion.
-/// \returns A 32-bit signed integer containing the converted value.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvttsd_si32(__m128d __a)
-{
- return __builtin_ia32_cvttsd2si((__v2df)__a);
-}
-
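-/* Illustrative sketch, not part of this header: _mm_cvtsd_si32 rounds
-   according to the current MXCSR rounding mode (round-to-nearest-even by
-   default), while _mm_cvttsd_si32 always truncates toward zero like a C
-   cast.  The values in the comments assume the default rounding mode. */
-#include <emmintrin.h>
-
-static inline void example_convert_modes(void)
-{
-  __m128d __v = _mm_set_sd(1.5);
-  int __rounded   = _mm_cvtsd_si32(__v);  /* 2 under round-to-nearest-even */
-  int __truncated = _mm_cvttsd_si32(__v); /* 1, truncated toward zero      */
-  (void)__rounded;
-  (void)__truncated;
-}
-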
-/// Converts the two double-precision floating-point elements of a
-/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
-/// returned in a 64-bit vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpd_pi32(__m128d __a)
-{
- return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
-}
-
-/// Converts the two double-precision floating-point elements of a
-/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
-/// returned in a 64-bit vector of [2 x i32].
-///
-/// If the result of either conversion is inexact, the result is truncated
-/// (rounded towards zero) regardless of the current MXCSR setting.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double].
-/// \returns A 64-bit vector of [2 x i32] containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvttpd_pi32(__m128d __a)
-{
- return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
-}
-
-/// Converts the two signed 32-bit integer elements of a 64-bit vector of
-/// [2 x i32] into two double-precision floating-point values, returned in a
-/// 128-bit vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
-///
-/// \param __a
-/// A 64-bit vector of [2 x i32].
-/// \returns A 128-bit vector of [2 x double] containing the converted values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpi32_pd(__m64 __a)
-{
- return __builtin_ia32_cvtpi2pd((__v2si)__a);
-}
-
-/// Returns the low-order element of a 128-bit vector of [2 x double] as
-/// a double-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
-/// \returns A double-precision floating-point value copied from the lower 64
-/// bits of \a __a.
-static __inline__ double __DEFAULT_FN_ATTRS
-_mm_cvtsd_f64(__m128d __a)
-{
- return __a[0];
-}
-
-/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
-/// memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
-///
-/// \param __dp
-/// A pointer to a 128-bit memory location. The address of the memory
-/// location has to be 16-byte aligned.
-/// \returns A 128-bit vector of [2 x double] containing the loaded values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_load_pd(double const *__dp)
-{
- return *(const __m128d*)__dp;
-}
-
-/// Loads a double-precision floating-point value from a specified memory
-/// location and duplicates it to both vector elements of a 128-bit vector of
-/// [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
-///
-/// \param __dp
-/// A pointer to a memory location containing a double-precision value.
-/// \returns A 128-bit vector of [2 x double] containing the loaded and
-/// duplicated values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_load1_pd(double const *__dp)
-{
- struct __mm_load1_pd_struct {
- double __u;
- } __attribute__((__packed__, __may_alias__));
- double __u = ((const struct __mm_load1_pd_struct*)__dp)->__u;
- return __extension__ (__m128d){ __u, __u };
-}
-
-#define _mm_load_pd1(dp) _mm_load1_pd(dp)
-
-/// Loads two double-precision values, in reverse order, from an aligned
-/// memory location into a 128-bit vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
-/// needed shuffling instructions. In AVX mode, the shuffling may be combined
-/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
-///
-/// \param __dp
-/// A 16-byte aligned pointer to an array of double-precision values to be
-/// loaded in reverse order.
-/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
-/// values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_loadr_pd(double const *__dp)
-{
- __m128d __u = *(const __m128d*)__dp;
- return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
-}
-
-/// Loads a 128-bit floating-point vector of [2 x double] from an
-/// unaligned memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
-///
-/// \param __dp
-/// A pointer to a 128-bit memory location. The address of the memory
-/// location does not have to be aligned.
-/// \returns A 128-bit vector of [2 x double] containing the loaded values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_loadu_pd(double const *__dp)
-{
- struct __loadu_pd {
- __m128d_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_pd*)__dp)->__v;
-}
-
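-/* Illustrative sketch, not part of this header: _mm_load_pd requires a
-   16-byte aligned pointer, while _mm_loadu_pd accepts any address.  The
-   hypothetical helper below sums an even-length double array two lanes at
-   a time using unaligned loads. */
-#include <emmintrin.h>
-
-static inline double example_sum_pairs(const double *__p, int __n /* even */)
-{
-  __m128d __acc = _mm_setzero_pd();
-  int __i;
-  for (__i = 0; __i + 1 < __n; __i += 2)
-    __acc = _mm_add_pd(__acc, _mm_loadu_pd(__p + __i));
-  /* horizontal add: low lane + high lane */
-  return _mm_cvtsd_f64(__acc) + _mm_cvtsd_f64(_mm_unpackhi_pd(__acc, __acc));
-}
-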
-/// Loads a 64-bit integer value to the low element of a 128-bit integer
-/// vector and clears the upper element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __a
-/// A pointer to a 64-bit memory location. The address of the memory
-/// location does not have to be aligned.
-/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_loadu_si64(void const *__a)
-{
- struct __loadu_si64 {
- long long __v;
- } __attribute__((__packed__, __may_alias__));
- long long __u = ((const struct __loadu_si64*)__a)->__v;
- return __extension__ (__m128i)(__v2di){__u, 0LL};
-}
-
-/// Loads a 32-bit integer value to the low element of a 128-bit integer
-/// vector and clears the upper elements.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
-///
-/// \param __a
-/// A pointer to a 32-bit memory location. The address of the memory
-/// location does not have to be aligned.
-/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_loadu_si32(void const *__a)
-{
- struct __loadu_si32 {
- int __v;
- } __attribute__((__packed__, __may_alias__));
- int __u = ((const struct __loadu_si32*)__a)->__v;
- return __extension__ (__m128i)(__v4si){__u, 0, 0, 0};
-}
-
-/// Loads a 16-bit integer value to the low element of a 128-bit integer
-/// vector and clears the upper elements.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic does not correspond to a specific instruction.
-///
-/// \param __a
-/// A pointer to a 16-bit memory location. The address of the memory
-/// location does not have to be aligned.
-/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_loadu_si16(void const *__a)
-{
- struct __loadu_si16 {
- short __v;
- } __attribute__((__packed__, __may_alias__));
- short __u = ((const struct __loadu_si16*)__a)->__v;
- return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
-}
-
-/// Loads a 64-bit double-precision value to the low element of a
-/// 128-bit vector of [2 x double] and clears the upper element.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
-///
-/// \param __dp
-/// A pointer to a memory location containing a double-precision value.
-/// The address of the memory location does not have to be aligned.
-/// \returns A 128-bit vector of [2 x double] containing the loaded value.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_load_sd(double const *__dp)
-{
- struct __mm_load_sd_struct {
- double __u;
- } __attribute__((__packed__, __may_alias__));
- double __u = ((const struct __mm_load_sd_struct*)__dp)->__u;
- return __extension__ (__m128d){ __u, 0 };
-}
-
-/// Loads a double-precision value into the high-order bits of a 128-bit
-/// vector of [2 x double]. The low-order bits are copied from the low-order
-/// bits of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. \n
-/// Bits [63:0] are written to bits [63:0] of the result.
-/// \param __dp
-/// A pointer to a 64-bit memory location containing a double-precision
-/// floating-point value that is loaded. The loaded value is written to bits
-/// [127:64] of the result. The address of the memory location does not have
-/// to be aligned.
-/// \returns A 128-bit vector of [2 x double] containing the moved values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_loadh_pd(__m128d __a, double const *__dp)
-{
- struct __mm_loadh_pd_struct {
- double __u;
- } __attribute__((__packed__, __may_alias__));
- double __u = ((const struct __mm_loadh_pd_struct*)__dp)->__u;
- return __extension__ (__m128d){ __a[0], __u };
-}
-
-/// Loads a double-precision value into the low-order bits of a 128-bit
-/// vector of [2 x double]. The high-order bits are copied from the
-/// high-order bits of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. \n
-/// Bits [127:64] are written to bits [127:64] of the result.
-/// \param __dp
-/// A pointer to a 64-bit memory location containing a double-precision
-/// floating-point value that is loaded. The loaded value is written to bits
-/// [63:0] of the result. The address of the memory location does not have to
-/// be aligned.
-/// \returns A 128-bit vector of [2 x double] containing the moved values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_loadl_pd(__m128d __a, double const *__dp)
-{
- struct __mm_loadl_pd_struct {
- double __u;
- } __attribute__((__packed__, __may_alias__));
- double __u = ((const struct __mm_loadl_pd_struct*)__dp)->__u;
- return __extension__ (__m128d){ __u, __a[1] };
-}
-
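-/* Illustrative sketch, not part of this header: _mm_loadl_pd and
-   _mm_loadh_pd can gather two doubles from unrelated addresses into one
-   vector.  Starting from _mm_undefined_pd() is fine here because both
-   lanes are overwritten.  The helper name is hypothetical. */
-#include <emmintrin.h>
-
-static inline __m128d example_gather2_pd(const double *__lo, const double *__hi)
-{
-  __m128d __v = _mm_undefined_pd(); /* contents irrelevant, fully replaced */
-  __v = _mm_loadl_pd(__v, __lo);    /* element 0 <- *__lo                  */
-  __v = _mm_loadh_pd(__v, __hi);    /* element 1 <- *__hi                  */
-  return __v;
-}
-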
-/// Constructs a 128-bit floating-point vector of [2 x double] with
-/// unspecified content. This could be used as an argument to another
-/// intrinsic function where the argument is required but the value is not
-/// actually used.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
-/// content.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_undefined_pd(void)
-{
- return (__m128d)__builtin_ia32_undef128();
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
-/// 64 bits of the vector are initialized with the specified double-precision
-/// floating-point value. The upper 64 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __w
-/// A double-precision floating-point value used to initialize the lower 64
-/// bits of the result.
-/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
-/// lower 64 bits contain the value of the parameter. The upper 64 bits are
-/// set to zero.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_set_sd(double __w)
-{
- return __extension__ (__m128d){ __w, 0 };
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double], with each
-/// of the two double-precision floating-point vector elements set to the
-/// specified double-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
-///
-/// \param __w
-/// A double-precision floating-point value used to initialize each vector
-/// element of the result.
-/// \returns An initialized 128-bit floating-point vector of [2 x double].
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_set1_pd(double __w)
-{
- return __extension__ (__m128d){ __w, __w };
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double], with each
-/// of the two double-precision floating-point vector elements set to the
-/// specified double-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
-///
-/// \param __w
-/// A double-precision floating-point value used to initialize each vector
-/// element of the result.
-/// \returns An initialized 128-bit floating-point vector of [2 x double].
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_set_pd1(double __w)
-{
- return _mm_set1_pd(__w);
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double]
-/// initialized with the specified double-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
-///
-/// \param __w
-/// A double-precision floating-point value used to initialize the upper 64
-/// bits of the result.
-/// \param __x
-/// A double-precision floating-point value used to initialize the lower 64
-/// bits of the result.
-/// \returns An initialized 128-bit floating-point vector of [2 x double].
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_set_pd(double __w, double __x)
-{
- return __extension__ (__m128d){ __x, __w };
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double],
-/// initialized in reverse order with the specified double-precision
-/// floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
-///
-/// \param __w
-/// A double-precision floating-point value used to initialize the lower 64
-/// bits of the result.
-/// \param __x
-/// A double-precision floating-point value used to initialize the upper 64
-/// bits of the result.
-/// \returns An initialized 128-bit floating-point vector of [2 x double].
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_setr_pd(double __w, double __x)
-{
- return __extension__ (__m128d){ __w, __x };
-}
-
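-/* Illustrative sketch, not part of this header: _mm_set_pd takes its
-   arguments high element first, while _mm_setr_pd takes them in memory
-   (low-to-high) order, so the two vectors built below are identical. */
-#include <emmintrin.h>
-
-static inline void example_set_order(void)
-{
-  __m128d __a = _mm_set_pd(2.0, 1.0);  /* element 0 = 1.0, element 1 = 2.0 */
-  __m128d __b = _mm_setr_pd(1.0, 2.0); /* same layout as __a               */
-  (void)__a;
-  (void)__b;
-}
-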
-/// Constructs a 128-bit floating-point vector of [2 x double]
-/// initialized to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
-///
-/// \returns An initialized 128-bit floating-point vector of [2 x double] with
-/// all elements set to zero.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_setzero_pd(void)
-{
- return __extension__ (__m128d){ 0, 0 };
-}
-
-/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
-/// 64 bits are set to the lower 64 bits of the second parameter. The upper
-/// 64 bits are set to the upper 64 bits of the first parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
-/// upper 64 bits of the result.
-/// \param __b
-/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
-/// lower 64 bits of the result.
-/// \returns A 128-bit vector of [2 x double] containing the moved values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_move_sd(__m128d __a, __m128d __b)
-{
- __a[0] = __b[0];
- return __a;
-}
-
-/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
-/// memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
-///
-/// \param __dp
-/// A pointer to a 64-bit memory location.
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store_sd(double *__dp, __m128d __a)
-{
- struct __mm_store_sd_struct {
- double __u;
- } __attribute__((__packed__, __may_alias__));
- ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
-}
-
-/// Moves packed double-precision values from a 128-bit vector of
-/// [2 x double] to a memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
-///
-/// \param __dp
-/// A pointer to an aligned memory location that can store two
-/// double-precision values.
-/// \param __a
-/// A packed 128-bit vector of [2 x double] containing the values to be
-/// moved.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store_pd(double *__dp, __m128d __a)
-{
- *(__m128d*)__dp = __a;
-}
-
-/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
-/// the upper and lower 64 bits of a memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the
-/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
-///
-/// \param __dp
-/// A pointer to a memory location that can store two double-precision
-/// values.
-/// \param __a
-/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
-/// of the values in \a __dp.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store1_pd(double *__dp, __m128d __a)
-{
- __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
- _mm_store_pd(__dp, __a);
-}
-
-/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
-/// the upper and lower 64 bits of a memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the
-/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
-///
-/// \param __dp
-/// A pointer to a memory location that can store two double-precision
-/// values.
-/// \param __a
-/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
-/// of the values in \a __dp.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store_pd1(double *__dp, __m128d __a)
-{
- _mm_store1_pd(__dp, __a);
-}
-
-/// Stores a 128-bit vector of [2 x double] into an unaligned memory
-/// location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
-///
-/// \param __dp
-/// A pointer to a 128-bit memory location. The address of the memory
-/// location does not have to be aligned.
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storeu_pd(double *__dp, __m128d __a)
-{
- struct __storeu_pd {
- __m128d_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_pd*)__dp)->__v = __a;
-}
-
-/// Stores two double-precision values, in reverse order, from a 128-bit
-/// vector of [2 x double] to a 16-byte aligned memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to a shuffling instruction followed by a
-/// <c> VMOVAPD / MOVAPD </c> instruction.
-///
-/// \param __dp
-/// A pointer to a 16-byte aligned memory location that can store two
-/// double-precision values.
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the values to be reversed and
-/// stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storer_pd(double *__dp, __m128d __a)
-{
- __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
- *(__m128d *)__dp = __a;
-}
-
-/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
-/// memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
-///
-/// \param __dp
-/// A pointer to a 64-bit memory location.
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storeh_pd(double *__dp, __m128d __a)
-{
- struct __mm_storeh_pd_struct {
- double __u;
- } __attribute__((__packed__, __may_alias__));
- ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
-}
-
-/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
-/// memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
-///
-/// \param __dp
-/// A pointer to a 64-bit memory location.
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storel_pd(double *__dp, __m128d __a)
-{
- struct __mm_storeh_pd_struct {
- double __u;
- } __attribute__((__packed__, __may_alias__));
- ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
-}
-
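/*
 * Editorial usage sketch, not part of the patch: extracting the two lanes of
 * a [2 x double] vector with the scalar store intrinsics. Assumes SSE2; the
 * function name is illustrative only.
 */
#include <emmintrin.h>

static void split_pd_example(double *lo, double *hi)
{
  __m128d v = _mm_set_pd(4.0, 3.0); /* lane 0 = 3.0, lane 1 = 4.0 */
  _mm_storel_pd(lo, v);             /* writes 3.0 */
  _mm_storeh_pd(hi, v);             /* writes 4.0 */
}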
-/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
-/// saving the lower 8 bits of each sum in the corresponding element of a
-/// 128-bit result vector of [16 x i8].
-///
-/// The integer elements of both parameters can be either signed or unsigned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [16 x i8].
-/// \param __b
-/// A 128-bit vector of [16 x i8].
-/// \returns A 128-bit vector of [16 x i8] containing the sums of both
-/// parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_add_epi8(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v16qu)__a + (__v16qu)__b);
-}
-
-/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
-/// saving the lower 16 bits of each sum in the corresponding element of a
-/// 128-bit result vector of [8 x i16].
-///
-/// The integer elements of both parameters can be either signed or unsigned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [8 x i16].
-/// \param __b
-/// A 128-bit vector of [8 x i16].
-/// \returns A 128-bit vector of [8 x i16] containing the sums of both
-/// parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_add_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v8hu)__a + (__v8hu)__b);
-}
-
-/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
-/// saving the lower 32 bits of each sum in the corresponding element of a
-/// 128-bit result vector of [4 x i32].
-///
-/// The integer elements of both parameters can be either signed or unsigned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x i32].
-/// \param __b
-/// A 128-bit vector of [4 x i32].
-/// \returns A 128-bit vector of [4 x i32] containing the sums of both
-/// parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_add_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v4su)__a + (__v4su)__b);
-}
-
-/// Adds two signed or unsigned 64-bit integer values, returning the
-/// lower 64 bits of the sum.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
-///
-/// \param __a
-/// A 64-bit integer.
-/// \param __b
-/// A 64-bit integer.
-/// \returns A 64-bit integer containing the sum of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_add_si64(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
-}
-
-/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
-/// saving the lower 64 bits of each sum in the corresponding element of a
-/// 128-bit result vector of [2 x i64].
-///
-/// The integer elements of both parameters can be either signed or unsigned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x i64].
-/// \param __b
-/// A 128-bit vector of [2 x i64].
-/// \returns A 128-bit vector of [2 x i64] containing the sums of both
-/// parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_add_epi64(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v2du)__a + (__v2du)__b);
-}
-
-/// Adds, with saturation, the corresponding elements of two 128-bit
-/// signed [16 x i8] vectors, saving each sum in the corresponding element of
-/// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
-/// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
-///
-/// \param __a
-/// A 128-bit signed [16 x i8] vector.
-/// \param __b
-/// A 128-bit signed [16 x i8] vector.
-/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
-/// both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_adds_epi8(__m128i __a, __m128i __b)
-{
-#if (__clang_major__ > 14)
- return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
-#else
- return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
-#endif
-}
-
-/// Adds, with saturation, the corresponding elements of two 128-bit
-/// signed [8 x i16] vectors, saving each sum in the corresponding element of
-/// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
-/// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
-/// 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
-///
-/// \param __a
-/// A 128-bit signed [8 x i16] vector.
-/// \param __b
-/// A 128-bit signed [8 x i16] vector.
-/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
-/// both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_adds_epi16(__m128i __a, __m128i __b)
-{
-#if (__clang_major__ > 14)
- return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
-#else
- return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
-#endif
-}
-
-/// Adds, with saturation, the corresponding elements of two 128-bit
-/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
-/// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF
-/// are saturated to 0xFF. Negative sums are saturated to 0x00.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
-///
-/// \param __a
-/// A 128-bit unsigned [16 x i8] vector.
-/// \param __b
-/// A 128-bit unsigned [16 x i8] vector.
-/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
-/// of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_adds_epu8(__m128i __a, __m128i __b)
-{
-#if (__clang_major__ > 14)
- return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
-#else
- return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
-#endif
-}
-
-/// Adds, with saturation, the corresponding elements of two 128-bit
-/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
-/// of a 128-bit result vector of [8 x i16]. Positive sums greater than
-/// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
-///
-/// \param __a
-/// A 128-bit unsigned [8 x i16] vector.
-/// \param __b
-/// A 128-bit unsigned [8 x i16] vector.
-/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
-/// of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_adds_epu16(__m128i __a, __m128i __b)
-{
-#if (__clang_major__ > 14)
- return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
-#else
- return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
-#endif
-}
-
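/*
 * Editorial usage sketch, not part of the patch: contrasts wrapping and
 * saturating byte addition. Assumes SSE2; names are illustrative only.
 */
#include <emmintrin.h>

static void saturation_example(unsigned char wrap[16], unsigned char sat[16])
{
  __m128i a = _mm_set1_epi8((char)200);
  __m128i b = _mm_set1_epi8((char)100);

  /* _mm_add_epi8 wraps: (200 + 100) mod 256 = 44 in every byte. */
  _mm_storeu_si128((__m128i *)wrap, _mm_add_epi8(a, b));

  /* _mm_adds_epu8 saturates: every byte clamps to 0xFF (255). */
  _mm_storeu_si128((__m128i *)sat, _mm_adds_epu8(a, b));
}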
-/// Computes the rounded averages of corresponding elements of two
-/// 128-bit unsigned [16 x i8] vectors, saving each result in the
-/// corresponding element of a 128-bit result vector of [16 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
-///
-/// \param __a
-/// A 128-bit unsigned [16 x i8] vector.
-/// \param __b
-/// A 128-bit unsigned [16 x i8] vector.
-/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
-/// averages of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_avg_epu8(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
-}
-
-/// Computes the rounded averages of corresponding elements of two
-/// 128-bit unsigned [8 x i16] vectors, saving each result in the
-/// corresponding element of a 128-bit result vector of [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
-///
-/// \param __a
-/// A 128-bit unsigned [8 x i16] vector.
-/// \param __b
-/// A 128-bit unsigned [8 x i16] vector.
-/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
-/// averages of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_avg_epu16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
-/// vectors, producing eight intermediate 32-bit signed integer products, and
-/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
-/// [4 x i32] vector.
-///
-/// For example, bits [15:0] of both parameters are multiplied producing a
-/// 32-bit product, bits [31:16] of both parameters are multiplied producing
-/// a 32-bit product, and the sum of those two products becomes bits [31:0]
-/// of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
-///
-/// \param __a
-/// A 128-bit signed [8 x i16] vector.
-/// \param __b
-/// A 128-bit signed [8 x i16] vector.
-/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
-/// of both parameters.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_madd_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
-}
-
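/*
 * Editorial usage sketch, not part of the patch: a small fixed-point dot
 * product built on _mm_madd_epi16, with the horizontal reduction done using
 * other intrinsics from this header. Assumes SSE2; names are illustrative.
 */
#include <emmintrin.h>

static int dot8_i16_example(const short *a, const short *b) /* 8 elements each */
{
  __m128i va   = _mm_loadu_si128((const __m128i *)a);
  __m128i vb   = _mm_loadu_si128((const __m128i *)b);
  __m128i pair = _mm_madd_epi16(va, vb); /* four 32-bit sums of adjacent products */

  /* Sum the four 32-bit lanes into lane 0. */
  __m128i s = _mm_add_epi32(pair, _mm_srli_si128(pair, 8));
  s = _mm_add_epi32(s, _mm_srli_si128(s, 4));
  return _mm_cvtsi128_si32(s);
}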
-/// Compares corresponding elements of two 128-bit signed [8 x i16]
-/// vectors, saving the greater value from each comparison in the
-/// corresponding element of a 128-bit result vector of [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
-///
-/// \param __a
-/// A 128-bit signed [8 x i16] vector.
-/// \param __b
-/// A 128-bit signed [8 x i16] vector.
-/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
-/// each comparison.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_max_epi16(__m128i __a, __m128i __b)
-{
-#if (__clang_major__ < 14)
- return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
-#else
- return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
-#endif
-}
-
-/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
-/// vectors, saving the greater value from each comparison in the
-/// corresponding element of a 128-bit result vector of [16 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
-///
-/// \param __a
-/// A 128-bit unsigned [16 x i8] vector.
-/// \param __b
-/// A 128-bit unsigned [16 x i8] vector.
-/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
-/// each comparison.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_max_epu8(__m128i __a, __m128i __b)
-{
-#if (__clang_major__ < 14)
- return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
-#else
- return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
-#endif
-}
-
-/// Compares corresponding elements of two 128-bit signed [8 x i16]
-/// vectors, saving the smaller value from each comparison in the
-/// corresponding element of a 128-bit result vector of [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
-///
-/// \param __a
-/// A 128-bit signed [8 x i16] vector.
-/// \param __b
-/// A 128-bit signed [8 x i16] vector.
-/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
-/// each comparison.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_min_epi16(__m128i __a, __m128i __b)
-{
-#if (__clang_major__ < 14)
- return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
-#else
- return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
-#endif
-}
-
-/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
-/// vectors, saving the smaller value from each comparison in the
-/// corresponding element of a 128-bit result vector of [16 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
-///
-/// \param __a
-/// A 128-bit unsigned [16 x i8] vector.
-/// \param __b
-/// A 128-bit unsigned [16 x i8] vector.
-/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
-/// each comparison.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_min_epu8(__m128i __a, __m128i __b)
-{
-#if (__clang_major__ < 14)
- return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
-#else
- return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
-#endif
-}
-
-/// Multiplies the corresponding elements of two signed [8 x i16]
-/// vectors, saving the upper 16 bits of each 32-bit product in the
-/// corresponding element of a 128-bit signed [8 x i16] result vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
-///
-/// \param __a
-/// A 128-bit signed [8 x i16] vector.
-/// \param __b
-/// A 128-bit signed [8 x i16] vector.
-/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
-/// each of the eight 32-bit products.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mulhi_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Multiplies the corresponding elements of two unsigned [8 x i16]
-/// vectors, saving the upper 16 bits of each 32-bit product in the
-/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
-///
-/// \param __a
-/// A 128-bit unsigned [8 x i16] vector.
-/// \param __b
-/// A 128-bit unsigned [8 x i16] vector.
-/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
-/// of each of the eight 32-bit products.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mulhi_epu16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Multiplies the corresponding elements of two signed [8 x i16]
-/// vectors, saving the lower 16 bits of each 32-bit product in the
-/// corresponding element of a 128-bit signed [8 x i16] result vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
-///
-/// \param __a
-/// A 128-bit signed [8 x i16] vector.
-/// \param __b
-/// A 128-bit signed [8 x i16] vector.
-/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
-/// each of the eight 32-bit products.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mullo_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v8hu)__a * (__v8hu)__b);
-}
-
-/// Multiplies 32-bit unsigned integer values contained in the lower bits
-/// of the two 64-bit integer vectors and returns the 64-bit unsigned
-/// product.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
-///
-/// \param __a
-/// A 64-bit integer containing one of the source operands.
-/// \param __b
-/// A 64-bit integer containing one of the source operands.
-/// \returns A 64-bit integer vector containing the product of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_mul_su32(__m64 __a, __m64 __b)
-{
- return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
-}
-
-/// Multiplies 32-bit unsigned integer values contained in the lower
-/// bits of the corresponding elements of two [2 x i64] vectors, and returns
-/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
-///
-/// \param __a
-/// A [2 x i64] vector containing one of the source operands.
-/// \param __b
-/// A [2 x i64] vector containing one of the source operands.
-/// \returns A [2 x i64] vector containing the product of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mul_epu32(__m128i __a, __m128i __b)
-{
- return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
-}
-
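/*
 * Editorial usage sketch, not part of the patch: _mm_mul_epu32 multiplies
 * only the even (0 and 2) 32-bit lanes and produces two full 64-bit
 * products. Assumes SSE2; names are illustrative only.
 */
#include <emmintrin.h>

static void widening_mul_example(unsigned long long out[2])
{
  __m128i a = _mm_set_epi32(0, (int)0xFFFFFFFF, 0, (int)0xFFFFFFFF);
  __m128i b = _mm_set_epi32(0, 2, 0, 3);

  /* out[0] = 3 * (2^32 - 1), out[1] = 2 * (2^32 - 1); odd lanes are ignored. */
  _mm_storeu_si128((__m128i *)out, _mm_mul_epu32(a, b));
}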
-/// Computes the absolute differences of corresponding 8-bit integer
-/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
-/// separately sums the second 8 absolute differences. Packs these two
-/// unsigned 16-bit integer sums into the upper and lower elements of a
-/// [2 x i64] vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing one of the source operands.
-/// \param __b
-/// A 128-bit integer vector containing one of the source operands.
-/// \returns A [2 x i64] vector containing the sums of the sets of absolute
-/// differences between both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sad_epu8(__m128i __a, __m128i __b)
-{
- return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
-}
-
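/*
 * Editorial usage sketch, not part of the patch: summing the absolute
 * differences of 16 bytes with _mm_sad_epu8, as used in block-matching
 * costs. Assumes SSE2; the function name is illustrative only.
 */
#include <emmintrin.h>

static unsigned sad16_example(const unsigned char *p, const unsigned char *q)
{
  __m128i a   = _mm_loadu_si128((const __m128i *)p);
  __m128i b   = _mm_loadu_si128((const __m128i *)q);
  __m128i sad = _mm_sad_epu8(a, b); /* partial sums in the two 64-bit halves */

  return (unsigned)(_mm_cvtsi128_si32(sad) +
                    _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
}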
-/// Subtracts the corresponding 8-bit integer values in the operands.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the minuends.
-/// \param __b
-/// A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the differences of the values
-/// in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sub_epi8(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v16qu)__a - (__v16qu)__b);
-}
-
-/// Subtracts the corresponding 16-bit integer values in the operands.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the minuends.
-/// \param __b
-/// A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the differences of the values
-/// in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sub_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v8hu)__a - (__v8hu)__b);
-}
-
-/// Subtracts the corresponding 32-bit integer values in the operands.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the minuends.
-/// \param __b
-/// A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the differences of the values
-/// in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sub_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v4su)__a - (__v4su)__b);
-}
-
-/// Subtracts signed or unsigned 64-bit integer values and writes the
-/// difference to the corresponding bits in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing the minuend.
-/// \param __b
-/// A 64-bit integer vector containing the subtrahend.
-/// \returns A 64-bit integer vector containing the difference of the values in
-/// the operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_sub_si64(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
-}
-
-/// Subtracts the corresponding elements of two [2 x i64] vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the minuends.
-/// \param __b
-/// A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the differences of the values
-/// in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sub_epi64(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v2du)__a - (__v2du)__b);
-}
-
-/// Subtracts corresponding 8-bit signed integer values in the input and
-/// returns the differences in the corresponding bytes in the destination.
-/// Differences greater than 0x7F are saturated to 0x7F, and differences less
-/// than 0x80 are saturated to 0x80.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the minuends.
-/// \param __b
-/// A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the differences of the values
-/// in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_subs_epi8(__m128i __a, __m128i __b)
-{
-#if (__clang_major__ > 14)
- return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
-#else
- return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
-#endif
-}
-
-/// Subtracts corresponding 16-bit signed integer values in the input and
-/// returns the differences in the corresponding bytes in the destination.
-/// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less
-/// than 0x8000 are saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the minuends.
-/// \param __b
-/// A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the differences of the values
-/// in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_subs_epi16(__m128i __a, __m128i __b)
-{
-#if (__clang_major__ > 14)
- return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
-#else
- return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
-#endif
-}
-
-/// Subtracts corresponding 8-bit unsigned integer values in the input
-/// and returns the differences in the corresponding bytes in the
-/// destination. Differences less than 0x00 are saturated to 0x00.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the minuends.
-/// \param __b
-/// A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the unsigned integer
-/// differences of the values in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_subs_epu8(__m128i __a, __m128i __b)
-{
-#if (__clang_major__ > 14)
- return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
-#else
- return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
-#endif
-}
-
-/// Subtracts corresponding 16-bit unsigned integer values in the input
-/// and returns the differences in the corresponding bytes in the
-/// destination. Differences less than 0x0000 are saturated to 0x0000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the minuends.
-/// \param __b
-/// A 128-bit integer vector containing the subtrahends.
-/// \returns A 128-bit integer vector containing the unsigned integer
-/// differences of the values in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_subs_epu16(__m128i __a, __m128i __b)
-{
-#if (__clang_major__ > 14)
- return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
-#else
- return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
-#endif
-}
-
-/// Performs a bitwise AND of two 128-bit integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing one of the source operands.
-/// \param __b
-/// A 128-bit integer vector containing one of the source operands.
-/// \returns A 128-bit integer vector containing the bitwise AND of the values
-/// in both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_and_si128(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v2du)__a & (__v2du)__b);
-}
-
-/// Performs a bitwise AND of two 128-bit integer vectors, using the
-/// one's complement of the values contained in the first source operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector containing the left source operand. The one's complement
-/// of this value is used in the bitwise AND.
-/// \param __b
-/// A 128-bit vector containing the right source operand.
-/// \returns A 128-bit integer vector containing the bitwise AND of the one's
-/// complement of the first operand and the values in the second operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_andnot_si128(__m128i __a, __m128i __b)
-{
- return (__m128i)(~(__v2du)__a & (__v2du)__b);
-}
-
-/// Performs a bitwise OR of two 128-bit integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing one of the source operands.
-/// \param __b
-/// A 128-bit integer vector containing one of the source operands.
-/// \returns A 128-bit integer vector containing the bitwise OR of the values
-/// in both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_or_si128(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v2du)__a | (__v2du)__b);
-}
-
-/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing one of the source operands.
-/// \param __b
-/// A 128-bit integer vector containing one of the source operands.
-/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
-/// values in both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_xor_si128(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v2du)__a ^ (__v2du)__b);
-}
-
-/// Left-shifts the 128-bit integer vector operand by the specified
-/// number of bytes. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_slli_si128(__m128i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
-///
-/// \param a
-/// A 128-bit integer vector containing the source operand.
-/// \param imm
-/// An immediate value specifying the number of bytes to left-shift operand
-/// \a a.
-/// \returns A 128-bit integer vector containing the left-shifted value.
-#define _mm_slli_si128(a, imm) \
- ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
-
-#define _mm_bslli_si128(a, imm) \
- ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
-
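/*
 * Editorial usage sketch, not part of the patch: _mm_slli_si128 shifts the
 * whole 128-bit register left by a constant number of bytes. Assumes SSE2;
 * the function name is illustrative only.
 */
#include <emmintrin.h>

static __m128i byte_shift_example(void)
{
  __m128i v = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                           7, 6, 5, 4, 3, 2, 1, 0);
  /* Bytes move toward higher lane indices; the four lowest bytes become 0. */
  return _mm_slli_si128(v, 4);
}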
-/// Left-shifts each 16-bit value in the 128-bit integer vector operand
-/// by the specified number of bits. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// An integer value specifying the number of bits to left-shift each value
-/// in operand \a __a.
-/// \returns A 128-bit integer vector containing the left-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_slli_epi16(__m128i __a, int __count)
-{
- return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
-}
-
-/// Left-shifts each 16-bit value in the 128-bit integer vector operand
-/// by the specified number of bits. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to left-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the left-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sll_epi16(__m128i __a, __m128i __count)
-{
- return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
-}
-
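/*
 * Editorial usage sketch, not part of the patch: the _mm_slli_* forms take
 * the shift count as an ordinary integer, while the _mm_sll_* forms read it
 * from bits [63:0] of a vector; both produce the same result here. Assumes
 * SSE2; names are illustrative only.
 */
#include <emmintrin.h>

static void shift_count_forms_example(short out_imm[8], short out_vec[8])
{
  __m128i v = _mm_set1_epi16(1);

  _mm_storeu_si128((__m128i *)out_imm, _mm_slli_epi16(v, 3));
  _mm_storeu_si128((__m128i *)out_vec, _mm_sll_epi16(v, _mm_cvtsi32_si128(3)));
  /* Every element of both outputs is 1 << 3 = 8. */
}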
-/// Left-shifts each 32-bit value in the 128-bit integer vector operand
-/// by the specified number of bits. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// An integer value specifying the number of bits to left-shift each value
-/// in operand \a __a.
-/// \returns A 128-bit integer vector containing the left-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_slli_epi32(__m128i __a, int __count)
-{
- return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
-}
-
-/// Left-shifts each 32-bit value in the 128-bit integer vector operand
-/// by the specified number of bits. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to left-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the left-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sll_epi32(__m128i __a, __m128i __count)
-{
- return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
-}
-
-/// Left-shifts each 64-bit value in the 128-bit integer vector operand
-/// by the specified number of bits. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// An integer value specifying the number of bits to left-shift each value
-/// in operand \a __a.
-/// \returns A 128-bit integer vector containing the left-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_slli_epi64(__m128i __a, int __count)
-{
- return __builtin_ia32_psllqi128((__v2di)__a, __count);
-}
-
-/// Left-shifts each 64-bit value in the 128-bit integer vector operand
-/// by the specified number of bits. Low-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to left-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the left-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sll_epi64(__m128i __a, __m128i __count)
-{
- return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
-}
-
-/// Right-shifts each 16-bit value in the 128-bit integer vector operand
-/// by the specified number of bits. High-order bits are filled with the sign
-/// bit of the initial value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// An integer value specifying the number of bits to right-shift each value
-/// in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srai_epi16(__m128i __a, int __count)
-{
- return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
-}
-
-/// Right-shifts each 16-bit value in the 128-bit integer vector operand
-/// by the specified number of bits. High-order bits are filled with the sign
-/// bit of the initial value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sra_epi16(__m128i __a, __m128i __count)
-{
- return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
-}
-
-/// Right-shifts each 32-bit value in the 128-bit integer vector operand
-/// by the specified number of bits. High-order bits are filled with the sign
-/// bit of the initial value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// An integer value specifying the number of bits to right-shift each value
-/// in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srai_epi32(__m128i __a, int __count)
-{
- return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
-}
-
-/// Right-shifts each 32-bit value in the 128-bit integer vector operand
-/// by the specified number of bits. High-order bits are filled with the sign
-/// bit of the initial value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sra_epi32(__m128i __a, __m128i __count)
-{
- return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
-}
-
-/// Right-shifts the 128-bit integer vector operand by the specified
-/// number of bytes. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_srli_si128(__m128i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
-///
-/// \param a
-/// A 128-bit integer vector containing the source operand.
-/// \param imm
-/// An immediate value specifying the number of bytes to right-shift operand
-/// \a a.
-/// \returns A 128-bit integer vector containing the right-shifted value.
-#define _mm_srli_si128(a, imm) \
- ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
-
-#define _mm_bsrli_si128(a, imm) \
- ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
-
-/// Right-shifts each of the 16-bit values in the 128-bit integer vector
-/// operand by the specified number of bits. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// An integer value specifying the number of bits to right-shift each value
-/// in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srli_epi16(__m128i __a, int __count)
-{
- return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
-}
-
-/// Right-shifts each of the 16-bit values in the 128-bit integer vector
-/// operand by the specified number of bits. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srl_epi16(__m128i __a, __m128i __count)
-{
- return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
-}
-
-/// Right-shifts each of the 32-bit values in the 128-bit integer vector
-/// operand by the specified number of bits. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// An integer value specifying the number of bits to right-shift each value
-/// in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srli_epi32(__m128i __a, int __count)
-{
- return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
-}
-
-/// Right-shifts each of the 32-bit values in the 128-bit integer vector
-/// operand by the specified number of bits. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srl_epi32(__m128i __a, __m128i __count)
-{
- return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
-}
-
-/// Right-shifts each of the 64-bit values in the 128-bit integer vector
-/// operand by the specified number of bits. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// An integer value specifying the number of bits to right-shift each value
-/// in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srli_epi64(__m128i __a, int __count)
-{
- return __builtin_ia32_psrlqi128((__v2di)__a, __count);
-}
-
-/// Right-shifts each of the 64-bit values in the 128-bit integer vector
-/// operand by the specified number of bits. High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the source operand.
-/// \param __count
-/// A 128-bit integer vector in which bits [63:0] specify the number of bits
-/// to right-shift each value in operand \a __a.
-/// \returns A 128-bit integer vector containing the right-shifted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_srl_epi64(__m128i __a, __m128i __count)
-{
- return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
-}
-
-/// Compares each of the corresponding 8-bit values of the 128-bit
-/// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
-/// for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \param __b
-/// A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmpeq_epi8(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v16qi)__a == (__v16qi)__b);
-}
-
-/// Compares each of the corresponding 16-bit values of the 128-bit
-/// integer vectors for equality. Each comparison yields 0x0 for false,
-/// 0xFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \param __b
-/// A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmpeq_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v8hi)__a == (__v8hi)__b);
-}
-
-/// Compares each of the corresponding 32-bit values of the 128-bit
-/// integer vectors for equality. Each comparison yields 0x0 for false,
-/// 0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \param __b
-/// A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmpeq_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v4si)__a == (__v4si)__b);
-}
-
-/// Compares each of the corresponding signed 8-bit values of the 128-bit
-/// integer vectors to determine if the values in the first operand are
-/// greater than those in the second operand. Each comparison yields 0x0 for
-/// false, 0xFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \param __b
-/// A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmpgt_epi8(__m128i __a, __m128i __b)
-{
- /* This function always performs a signed comparison, but __v16qi is a char
- which may be signed or unsigned, so use __v16qs. */
- return (__m128i)((__v16qs)__a > (__v16qs)__b);
-}
-
-/// Compares each of the corresponding signed 16-bit values of the
-/// 128-bit integer vectors to determine if the values in the first operand
-/// are greater than those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \param __b
-/// A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmpgt_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v8hi)__a > (__v8hi)__b);
-}
-
-/// Compares each of the corresponding signed 32-bit values of the
-/// 128-bit integer vectors to determine if the values in the first operand
-/// are greater than those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \param __b
-/// A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmpgt_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)((__v4si)__a > (__v4si)__b);
-}
-
-/// Compares each of the corresponding signed 8-bit values of the 128-bit
-/// integer vectors to determine if the values in the first operand are less
-/// than those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \param __b
-/// A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmplt_epi8(__m128i __a, __m128i __b)
-{
- return _mm_cmpgt_epi8(__b, __a);
-}
-
-/// Compares each of the corresponding signed 16-bit values of the
-/// 128-bit integer vectors to determine if the values in the first operand
-/// are less than those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \param __b
-/// A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmplt_epi16(__m128i __a, __m128i __b)
-{
- return _mm_cmpgt_epi16(__b, __a);
-}
-
-/// Compares each of the corresponding signed 32-bit values of the
-/// 128-bit integer vectors to determine if the values in the first operand
-/// are less than those in the second operand.
-///
-/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \param __b
-/// A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmplt_epi32(__m128i __a, __m128i __b)
-{
- return _mm_cmpgt_epi32(__b, __a);
-}
-
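/*
 * Editorial usage sketch, not part of the patch: the compare intrinsics
 * return all-ones/all-zero lane masks, which combine with the bitwise
 * intrinsics above into a branch-free per-lane select. Assumes SSE2; the
 * function name is illustrative only.
 */
#include <emmintrin.h>

/* For each 32-bit lane: (a > b) ? a : b, i.e. a signed maximum of [4 x i32]. */
static __m128i max_epi32_example(__m128i a, __m128i b)
{
  __m128i gt = _mm_cmpgt_epi32(a, b); /* 0xFFFFFFFF where a > b, else 0 */
  return _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
}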
-#ifdef __x86_64__
-/// Converts a 64-bit signed integer value from the second operand into a
-/// double-precision value and returns it in the lower element of a [2 x
-/// double] vector; the upper element of the returned vector is copied from
-/// the upper element of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
-/// copied to the upper 64 bits of the destination.
-/// \param __b
-/// A 64-bit signed integer operand containing the value to be converted.
-/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
-/// converted value of the second operand. The upper 64 bits are copied from
-/// the upper 64 bits of the first operand.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_cvtsi64_sd(__m128d __a, long long __b)
-{
- __a[0] = __b;
- return __a;
-}
-
-/// Converts the first (lower) element of a vector of [2 x double] into a
-/// 64-bit signed integer value, according to the current rounding mode.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
-/// conversion.
-/// \returns A 64-bit signed integer containing the converted value.
-static __inline__ long long __DEFAULT_FN_ATTRS
-_mm_cvtsd_si64(__m128d __a)
-{
- return __builtin_ia32_cvtsd2si64((__v2df)__a);
-}
-
-/// Converts the first (lower) element of a vector of [2 x double] into a
-/// 64-bit signed integer value, truncating the result when it is inexact.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
-/// conversion.
-/// \returns A 64-bit signed integer containing the converted value.
-static __inline__ long long __DEFAULT_FN_ATTRS
-_mm_cvttsd_si64(__m128d __a)
-{
- return __builtin_ia32_cvttsd2si64((__v2df)__a);
-}
-#endif
-
-/// Converts a vector of [4 x i32] into a vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \returns A 128-bit vector of [4 x float] containing the converted values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cvtepi32_ps(__m128i __a)
-{
- return (__m128)__builtin_convertvector((__v4si)__a, __v4sf);
-}
-
-/// Converts a vector of [4 x float] into a vector of [4 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit integer vector of [4 x i32] containing the converted
-/// values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtps_epi32(__m128 __a)
-{
- return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
-}
-
-/// Converts a vector of [4 x float] into a vector of [4 x i32],
-/// truncating the result when it is inexact.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x i32] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvttps_epi32(__m128 __a)
-{
- return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
-}
-
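/*
 * Editorial usage sketch, not part of the patch: _mm_cvtps_epi32 rounds
 * according to the current rounding mode (round-to-nearest-even by default),
 * while _mm_cvttps_epi32 always truncates toward zero. Assumes SSE2; names
 * are illustrative only.
 */
#include <emmintrin.h>

static void float_to_int_example(int rounded[4], int truncated[4])
{
  __m128 v = _mm_set_ps(-1.5f, 2.5f, 1.5f, 0.5f);

  _mm_storeu_si128((__m128i *)rounded,   _mm_cvtps_epi32(v));  /* 0, 2, 2, -2 */
  _mm_storeu_si128((__m128i *)truncated, _mm_cvttps_epi32(v)); /* 0, 1, 2, -1 */
}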
-/// Returns a vector of [4 x i32] where the lowest element is the input
-/// operand and the remaining elements are zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
-///
-/// \param __a
-/// A 32-bit signed integer operand.
-/// \returns A 128-bit vector of [4 x i32].
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtsi32_si128(int __a)
-{
- return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 };
-}
-
-#ifdef __x86_64__
-/// Returns a vector of [2 x i64] where the lower element is the input
-/// operand and the upper element is zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __a
-/// A 64-bit signed integer operand containing the value to be converted.
-/// \returns A 128-bit vector of [2 x i64] containing the converted value.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtsi64_si128(long long __a)
-{
- return __extension__ (__m128i)(__v2di){ __a, 0 };
-}
-#endif
-
-/// Moves the least significant 32 bits of a vector of [4 x i32] to a
-/// 32-bit signed integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
-///
-/// \param __a
-/// A vector of [4 x i32]. The least significant 32 bits are moved to the
-/// destination.
-/// \returns A 32-bit signed integer containing the moved value.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvtsi128_si32(__m128i __a)
-{
- __v4si __b = (__v4si)__a;
- return __b[0];
-}
-
-#ifdef __x86_64__
-/// Moves the least significant 64 bits of a vector of [2 x i64] to a
-/// 64-bit signed integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __a
-/// A vector of [2 x i64]. The least significant 64 bits are moved to the
-/// destination.
-/// \returns A 64-bit signed integer containing the moved value.
-static __inline__ long long __DEFAULT_FN_ATTRS
-_mm_cvtsi128_si64(__m128i __a)
-{
- return __a[0];
-}
-#endif
-
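A minimal sketch of the scalar/vector moves above; the constant 42 is arbitrary.

    #include <emmintrin.h>

    static int roundtrip_demo(void)
    {
        __m128i v = _mm_cvtsi32_si128(42);   /* vector lanes: { 42, 0, 0, 0 } */
        return _mm_cvtsi128_si32(v);         /* reads lane 0 back: 42 */
    }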
-/// Moves packed integer values from an aligned 128-bit memory location
-/// to elements in a 128-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
-///
-/// \param __p
-/// An aligned pointer to a memory location containing integer values.
-/// \returns A 128-bit integer vector containing the moved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_load_si128(__m128i const *__p)
-{
- return *__p;
-}
-
-/// Moves packed integer values from an unaligned 128-bit memory location
-/// to elements in a 128-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location containing integer values.
-/// \returns A 128-bit integer vector containing the moved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_loadu_si128(__m128i_u const *__p)
-{
- struct __loadu_si128 {
- __m128i_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_si128*)__p)->__v;
-}
-
-/// Returns a vector of [2 x i64] where the lower element is taken from
-/// the lower element of the operand, and the upper element is zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __p
-/// A pointer to a 64-bit memory location. The value at this location is
-/// loaded and written to bits [63:0] of the destination.
-/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
-/// moved value. The higher order bits are cleared.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_loadl_epi64(__m128i_u const *__p)
-{
- struct __mm_loadl_epi64_struct {
- long long __u;
- } __attribute__((__packed__, __may_alias__));
- return __extension__ (__m128i) { ((const struct __mm_loadl_epi64_struct*)__p)->__u, 0};
-}
-
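A short sketch of the load variants above, assuming a hypothetical int32_t buffer of at least four elements: _mm_load_si128 requires a 16-byte-aligned pointer, while _mm_loadu_si128 accepts any alignment.

    #include <emmintrin.h>
    #include <stdint.h>

    static __m128i load_any_alignment(const int32_t *data)
    {
        /* Safe for any alignment; use _mm_load_si128 only when the pointer
         * is known to be 16-byte aligned. */
        return _mm_loadu_si128((const __m128i_u *)data);
    }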
-/// Generates a 128-bit vector of [4 x i32] with unspecified content.
-/// This could be used as an argument to another intrinsic function where the
-/// argument is required but the value is not actually used.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \returns A 128-bit vector of [4 x i32] with unspecified content.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_undefined_si128(void)
-{
- return (__m128i)__builtin_ia32_undef128();
-}
-
-/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
-/// the specified 64-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __q1
-/// A 64-bit integer value used to initialize the upper 64 bits of the
-/// destination vector of [2 x i64].
-/// \param __q0
-/// A 64-bit integer value used to initialize the lower 64 bits of the
-/// destination vector of [2 x i64].
-/// \returns An initialized 128-bit vector of [2 x i64] containing the values
-/// provided in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_set_epi64x(long long __q1, long long __q0)
-{
- return __extension__ (__m128i)(__v2di){ __q0, __q1 };
-}
-
-/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
-/// the specified 64-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __q1
-/// A 64-bit integer value used to initialize the upper 64 bits of the
-/// destination vector of [2 x i64].
-/// \param __q0
-/// A 64-bit integer value used to initialize the lower 64 bits of the
-/// destination vector of [2 x i64].
-/// \returns An initialized 128-bit vector of [2 x i64] containing the values
-/// provided in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_set_epi64(__m64 __q1, __m64 __q0)
-{
- return _mm_set_epi64x((long long)__q1, (long long)__q0);
-}
-
-/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
-/// the specified 32-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __i3
-/// A 32-bit integer value used to initialize bits [127:96] of the
-/// destination vector.
-/// \param __i2
-/// A 32-bit integer value used to initialize bits [95:64] of the destination
-/// vector.
-/// \param __i1
-/// A 32-bit integer value used to initialize bits [63:32] of the destination
-/// vector.
-/// \param __i0
-/// A 32-bit integer value used to initialize bits [31:0] of the destination
-/// vector.
-/// \returns An initialized 128-bit vector of [4 x i32] containing the values
-/// provided in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_set_epi32(int __i3, int __i2, int __i1, int __i0)
-{
- return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
-}
-
-/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
-/// the specified 16-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __w7
-/// A 16-bit integer value used to initialize bits [127:112] of the
-/// destination vector.
-/// \param __w6
-/// A 16-bit integer value used to initialize bits [111:96] of the
-/// destination vector.
-/// \param __w5
-/// A 16-bit integer value used to initialize bits [95:80] of the destination
-/// vector.
-/// \param __w4
-/// A 16-bit integer value used to initialize bits [79:64] of the destination
-/// vector.
-/// \param __w3
-/// A 16-bit integer value used to initialize bits [63:48] of the destination
-/// vector.
-/// \param __w2
-/// A 16-bit integer value used to initialize bits [47:32] of the destination
-/// vector.
-/// \param __w1
-/// A 16-bit integer value used to initialize bits [31:16] of the destination
-/// vector.
-/// \param __w0
-/// A 16-bit integer value used to initialize bits [15:0] of the destination
-/// vector.
-/// \returns An initialized 128-bit vector of [8 x i16] containing the values
-/// provided in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0)
-{
- return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
-}
-
-/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
-/// the specified 8-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __b15
-/// Initializes bits [127:120] of the destination vector.
-/// \param __b14
-/// Initializes bits [119:112] of the destination vector.
-/// \param __b13
-/// Initializes bits [111:104] of the destination vector.
-/// \param __b12
-/// Initializes bits [103:96] of the destination vector.
-/// \param __b11
-/// Initializes bits [95:88] of the destination vector.
-/// \param __b10
-/// Initializes bits [87:80] of the destination vector.
-/// \param __b9
-/// Initializes bits [79:72] of the destination vector.
-/// \param __b8
-/// Initializes bits [71:64] of the destination vector.
-/// \param __b7
-/// Initializes bits [63:56] of the destination vector.
-/// \param __b6
-/// Initializes bits [55:48] of the destination vector.
-/// \param __b5
-/// Initializes bits [47:40] of the destination vector.
-/// \param __b4
-/// Initializes bits [39:32] of the destination vector.
-/// \param __b3
-/// Initializes bits [31:24] of the destination vector.
-/// \param __b2
-/// Initializes bits [23:16] of the destination vector.
-/// \param __b1
-/// Initializes bits [15:8] of the destination vector.
-/// \param __b0
-/// Initializes bits [7:0] of the destination vector.
-/// \returns An initialized 128-bit vector of [16 x i8] containing the values
-/// provided in the operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0)
-{
- return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
-}
-
-/// Initializes both values in a 128-bit integer vector with the
-/// specified 64-bit integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __q
-/// Integer value used to initialize the elements of the destination integer
-/// vector.
-/// \returns An initialized 128-bit integer vector of [2 x i64] with both
-/// elements containing the value provided in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_set1_epi64x(long long __q)
-{
- return _mm_set_epi64x(__q, __q);
-}
-
-/// Initializes both values in a 128-bit vector of [2 x i64] with the
-/// specified 64-bit value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __q
-/// A 64-bit value used to initialize the elements of the destination integer
-/// vector.
-/// \returns An initialized 128-bit vector of [2 x i64] with all elements
-/// containing the value provided in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_set1_epi64(__m64 __q)
-{
- return _mm_set_epi64(__q, __q);
-}
-
-/// Initializes all values in a 128-bit vector of [4 x i32] with the
-/// specified 32-bit value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __i
-/// A 32-bit value used to initialize the elements of the destination integer
-/// vector.
-/// \returns An initialized 128-bit vector of [4 x i32] with all elements
-/// containing the value provided in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_set1_epi32(int __i)
-{
- return _mm_set_epi32(__i, __i, __i, __i);
-}
-
-/// Initializes all values in a 128-bit vector of [8 x i16] with the
-/// specified 16-bit value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __w
-/// A 16-bit value used to initialize the elements of the destination integer
-/// vector.
-/// \returns An initialized 128-bit vector of [8 x i16] with all elements
-/// containing the value provided in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_set1_epi16(short __w)
-{
- return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
-}
-
-/// Initializes all values in a 128-bit vector of [16 x i8] with the
-/// specified 8-bit value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __b
-/// An 8-bit value used to initialize the elements of the destination integer
-/// vector.
-/// \returns An initialized 128-bit vector of [16 x i8] with all elements
-/// containing the value provided in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_set1_epi8(char __b)
-{
- return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b);
-}
-
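A minimal sketch of the broadcast (set1) helpers above: every lane of the result receives the same value.

    #include <emmintrin.h>

    static __m128i four_tens(void)
    {
        return _mm_set1_epi32(10);           /* { 10, 10, 10, 10 } */
    }

    static __m128i all_ff_bytes(void)
    {
        return _mm_set1_epi8((char)0xFF);    /* 16 bytes of 0xFF */
    }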
-/// Constructs a 128-bit integer vector, initialized in reverse order
-/// with the specified 64-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic does not correspond to a specific instruction.
-///
-/// \param __q0
-/// A 64-bit integral value used to initialize the lower 64 bits of the
-/// result.
-/// \param __q1
-/// A 64-bit integral value used to initialize the upper 64 bits of the
-/// result.
-/// \returns An initialized 128-bit integer vector.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_setr_epi64(__m64 __q0, __m64 __q1)
-{
- return _mm_set_epi64(__q1, __q0);
-}
-
-/// Constructs a 128-bit integer vector, initialized in reverse order
-/// with the specified 32-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __i0
-/// A 32-bit integral value used to initialize bits [31:0] of the result.
-/// \param __i1
-/// A 32-bit integral value used to initialize bits [63:32] of the result.
-/// \param __i2
-/// A 32-bit integral value used to initialize bits [95:64] of the result.
-/// \param __i3
-/// A 32-bit integral value used to initialize bits [127:96] of the result.
-/// \returns An initialized 128-bit integer vector.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
-{
- return _mm_set_epi32(__i3, __i2, __i1, __i0);
-}
-
-/// Constructs a 128-bit integer vector, initialized in reverse order
-/// with the specified 16-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __w0
-/// A 16-bit integral value used to initialize bits [15:0] of the result.
-/// \param __w1
-/// A 16-bit integral value used to initialize bits [31:16] of the result.
-/// \param __w2
-/// A 16-bit integral value used to initialize bits [47:32] of the result.
-/// \param __w3
-/// A 16-bit integral value used to initialize bits [63:48] of the result.
-/// \param __w4
-/// A 16-bit integral value used to initialize bits [79:64] of the result.
-/// \param __w5
-/// A 16-bit integral value used to initialize bits [95:80] of the result.
-/// \param __w6
-/// A 16-bit integral value used to initialize bits [111:96] of the result.
-/// \param __w7
-/// A 16-bit integral value used to initialize bits [127:112] of the result.
-/// \returns An initialized 128-bit integer vector.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
-{
- return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
-}
-
-/// Constructs a 128-bit integer vector, initialized in reverse order
-/// with the specified 8-bit integral values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __b0
-/// An 8-bit integral value used to initialize bits [7:0] of the result.
-/// \param __b1
-/// An 8-bit integral value used to initialize bits [15:8] of the result.
-/// \param __b2
-/// An 8-bit integral value used to initialize bits [23:16] of the result.
-/// \param __b3
-/// An 8-bit integral value used to initialize bits [31:24] of the result.
-/// \param __b4
-/// An 8-bit integral value used to initialize bits [39:32] of the result.
-/// \param __b5
-/// An 8-bit integral value used to initialize bits [47:40] of the result.
-/// \param __b6
-/// An 8-bit integral value used to initialize bits [55:48] of the result.
-/// \param __b7
-/// An 8-bit integral value used to initialize bits [63:56] of the result.
-/// \param __b8
-/// An 8-bit integral value used to initialize bits [71:64] of the result.
-/// \param __b9
-/// An 8-bit integral value used to initialize bits [79:72] of the result.
-/// \param __b10
-/// An 8-bit integral value used to initialize bits [87:80] of the result.
-/// \param __b11
-/// An 8-bit integral value used to initialize bits [95:88] of the result.
-/// \param __b12
-/// An 8-bit integral value used to initialize bits [103:96] of the result.
-/// \param __b13
-/// An 8-bit integral value used to initialize bits [111:104] of the result.
-/// \param __b14
-/// An 8-bit integral value used to initialize bits [119:112] of the result.
-/// \param __b15
-/// An 8-bit integral value used to initialize bits [127:120] of the result.
-/// \returns An initialized 128-bit integer vector.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
-{
- return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
-}
-
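A sketch contrasting the argument order of the set and setr forms: both calls below construct the same vector, whose element 0 (bits [31:0]) is 1.

    #include <emmintrin.h>

    static void order_demo(void)
    {
        __m128i a = _mm_set_epi32(4, 3, 2, 1);    /* arguments listed high element first */
        __m128i b = _mm_setr_epi32(1, 2, 3, 4);   /* arguments listed low element first  */
        (void)a;
        (void)b;                                  /* a and b hold identical values */
    }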
-/// Creates a 128-bit integer vector initialized to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
-///
-/// \returns An initialized 128-bit integer vector with all elements set to
-/// zero.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_setzero_si128(void)
-{
- return __extension__ (__m128i)(__v2di){ 0LL, 0LL };
-}
-
-/// Stores a 128-bit integer vector to a memory location aligned on a
-/// 128-bit boundary.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
-///
-/// \param __p
-/// A pointer to an aligned memory location that will receive the integer
-/// values.
-/// \param __b
-/// A 128-bit integer vector containing the values to be moved.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store_si128(__m128i *__p, __m128i __b)
-{
- *__p = __b;
-}
-
-/// Stores a 128-bit integer vector to an unaligned memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that will receive the integer values.
-/// \param __b
-/// A 128-bit integer vector containing the values to be moved.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storeu_si128(__m128i_u *__p, __m128i __b)
-{
- struct __storeu_si128 {
- __m128i_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_si128*)__p)->__v = __b;
-}
-
-/// Stores a 64-bit integer value from the low element of a 128-bit integer
-/// vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __p
-/// A pointer to a 64-bit memory location. The address of the memory
-/// location does not have to be aligned.
-/// \param __b
-/// A 128-bit integer vector containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storeu_si64(void *__p, __m128i __b)
-{
- struct __storeu_si64 {
- long long __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0];
-}
-
-/// Stores a 32-bit integer value from the low element of a 128-bit integer
-/// vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
-///
-/// \param __p
-/// A pointer to a 32-bit memory location. The address of the memory
-/// location does not have to be aligned.
-/// \param __b
-/// A 128-bit integer vector containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storeu_si32(void *__p, __m128i __b)
-{
- struct __storeu_si32 {
- int __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0];
-}
-
-/// Stores a 16-bit integer value from the low element of a 128-bit integer
-/// vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic does not correspond to a specific instruction.
-///
-/// \param __p
-/// A pointer to a 16-bit memory location. The address of the memory
-/// location does not have to be aligned.
-/// \param __b
-/// A 128-bit integer vector containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storeu_si16(void *__p, __m128i __b)
-{
- struct __storeu_si16 {
- short __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0];
-}
-
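A minimal sketch of the partial unaligned stores above, writing exactly four bytes from the low lane of a vector to an arbitrary address.

    #include <emmintrin.h>

    static void store_low_int(void *dst, __m128i v)
    {
        _mm_storeu_si32(dst, v);   /* writes only bits [31:0]; no alignment required */
    }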
-/// Moves bytes selected by the mask from the first operand to the
-/// specified unaligned memory location. When a mask bit is 1, the
-/// corresponding byte is written, otherwise it is not written.
-///
-/// To minimize caching, the data is flagged as non-temporal (unlikely to be
-/// used again soon). Exception and trap behavior for elements not selected
-/// for storage to memory are implementation dependent.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
-/// instruction.
-///
-/// \param __d
-/// A 128-bit integer vector containing the values to be moved.
-/// \param __n
-/// A 128-bit integer vector containing the mask. The most significant bit of
-/// each byte is the write mask for the corresponding byte of \a __d.
-/// \param __p
-/// A pointer to an unaligned 128-bit memory location where the specified
-/// values are moved.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
-{
- __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
-}
-
-/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
-/// a memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a 64-bit memory location that will receive the lower 64 bits
-/// of the integer vector parameter.
-/// \param __a
-/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
-/// value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storel_epi64(__m128i_u *__p, __m128i __a)
-{
- struct __mm_storel_epi64_struct {
- long long __u;
- } __attribute__((__packed__, __may_alias__));
- ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
-}
-
-/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit
-/// aligned memory location.
-///
-/// To minimize caching, the data is flagged as non-temporal (unlikely to be
-/// used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTPD / MOVNTPD </c> instruction.
-///
-/// \param __p
-/// A pointer to the 128-bit aligned memory location used to store the value.
-/// \param __a
-/// A vector of [2 x double] containing the 64-bit values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_pd(double *__p, __m128d __a)
-{
- __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
-}
-
-/// Stores a 128-bit integer vector to a 128-bit aligned memory location.
-///
-/// To minimize caching, the data is flagged as non-temporal (unlikely to be
-/// used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTDQ / MOVNTDQ </c> instruction.
-///
-/// \param __p
-/// A pointer to the 128-bit aligned memory location used to store the value.
-/// \param __a
-/// A 128-bit integer vector containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_si128(__m128i *__p, __m128i __a)
-{
- __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
-}
-
-/// Stores a 32-bit integer value in the specified memory location.
-///
-/// To minimize caching, the data is flagged as non-temporal (unlikely to be
-/// used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
-///
-/// \param __p
-/// A pointer to the 32-bit memory location used to store the value.
-/// \param __a
-/// A 32-bit integer containing the value to be stored.
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
-_mm_stream_si32(int *__p, int __a)
-{
- __builtin_ia32_movnti(__p, __a);
-}
-
-#ifdef __x86_64__
-/// Stores a 64-bit integer value in the specified memory location.
-///
-/// To minimize caching, the data is flagged as non-temporal (unlikely to be
-/// used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
-///
-/// \param __p
-/// A pointer to the 64-bit memory location used to store the value.
-/// \param __a
-/// A 64-bit integer containing the value to be stored.
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2")))
-_mm_stream_si64(long long *__p, long long __a)
-{
- __builtin_ia32_movnti64(__p, __a);
-}
-#endif
-
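A sketch of a streaming-store fill loop, assuming dst is 16-byte aligned; the trailing _mm_sfence (declared in <xmmintrin.h>) orders the weakly-ordered non-temporal stores against later ordinary stores.

    #include <emmintrin.h>
    #include <stddef.h>

    static void nt_fill(__m128i *dst, __m128i value, size_t n_vectors)
    {
        for (size_t i = 0; i < n_vectors; ++i)
            _mm_stream_si128(&dst[i], value);   /* bypasses the cache hierarchy */
        _mm_sfence();                           /* make the stores globally visible */
    }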
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/// The cache line containing \a __p is flushed and invalidated from all
-/// caches in the coherency domain.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
-///
-/// \param __p
-/// A pointer to the memory location used to identify the cache line to be
-/// flushed.
-void _mm_clflush(void const * __p);
-
-/// Forces strong memory ordering (serialization) between load
-/// instructions preceding this instruction and load instructions following
-/// this instruction, ensuring the system completes all previous loads before
-/// executing subsequent loads.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
-///
-void _mm_lfence(void);
-
-/// Forces strong memory ordering (serialization) between load and store
-/// instructions preceding this instruction and load and store instructions
-/// following this instruction, ensuring that the system completes all
-/// previous memory accesses before executing subsequent memory accesses.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
-///
-void _mm_mfence(void);
-
-#if defined(__cplusplus)
-} // extern "C"
-#endif
-
-/// Converts 16-bit signed integers from both 128-bit integer vector
-/// operands into 8-bit signed integers, and packs the results into the
-/// destination. Positive values greater than 0x7F are saturated to 0x7F.
-/// Negative values less than 0x80 are saturated to 0x80.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
-/// a signed integer and is converted to an 8-bit signed integer with
-/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
-/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
-/// written to the lower 64 bits of the result.
-/// \param __b
-/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
-/// a signed integer and is converted to an 8-bit signed integer with
-/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less
-/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are
-/// written to the higher 64 bits of the result.
-/// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_packs_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Converts 32-bit signed integers from both 128-bit integer vector
-/// operands into 16-bit signed integers, and packs the results into the
-/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
-/// Negative values less than 0x8000 are saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
-/// a signed integer and is converted to a 16-bit signed integer with
-/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
-/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
-/// are written to the lower 64 bits of the result.
-/// \param __b
-/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
-/// a signed integer and is converted to a 16-bit signed integer with
-/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
-/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
-/// are written to the higher 64 bits of the result.
-/// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_packs_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
-}
-
-/// Converts 16-bit signed integers from both 128-bit integer vector
-/// operands into 8-bit unsigned integers, and packs the results into the
-/// destination. Values greater than 0xFF are saturated to 0xFF. Values less
-/// than 0x00 are saturated to 0x00.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
-/// a signed integer and is converted to an 8-bit unsigned integer with
-/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
-/// written to the lower 64 bits of the result.
-/// \param __b
-/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
-/// a signed integer and is converted to an 8-bit unsigned integer with
-/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are
-/// written to the higher 64 bits of the result.
-/// \returns A 128-bit vector of [16 x i8] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_packus_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
-}
-
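A sketch of the saturating narrowing performed by the pack intrinsics: out-of-range 16-bit inputs clamp to the 8-bit range.

    #include <emmintrin.h>

    static __m128i narrow_demo(void)
    {
        __m128i lo = _mm_set1_epi16(300);      /* clamps to 255 under unsigned saturation */
        __m128i hi = _mm_set1_epi16(-5);       /* clamps to 0 */
        return _mm_packus_epi16(lo, hi);       /* low 8 bytes = 0xFF, high 8 bytes = 0x00 */
    }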
-/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
-/// the immediate-value parameter as a selector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \param __imm
-/// An immediate value. Bits [2:0] select one of the eight 16-bit elements of
-/// \a __a to be copied to bits [15:0] of the result. \n
-/// 000: assign values from bits [15:0] of \a __a. \n
-/// 001: assign values from bits [31:16] of \a __a. \n
-/// 010: assign values from bits [47:32] of \a __a. \n
-/// 011: assign values from bits [63:48] of \a __a. \n
-/// 100: assign values from bits [79:64] of \a __a. \n
-/// 101: assign values from bits [95:80] of \a __a. \n
-/// 110: assign values from bits [111:96] of \a __a. \n
-/// 111: assign values from bits [127:112] of \a __a.
-/// \returns An integer, whose lower 16 bits are selected from the 128-bit
-/// integer vector parameter and the remaining bits are assigned zeros.
-#define _mm_extract_epi16(a, imm) \
- ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
- (int)(imm)))
-
-/// Constructs a 128-bit integer vector by first making a copy of the
-/// 128-bit integer vector parameter, and then inserting the lower 16 bits
-/// of an integer parameter into an offset specified by the immediate-value
-/// parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector of [8 x i16]. This vector is copied to the
-/// result and then one of the eight elements in the result is replaced by
-/// the lower 16 bits of \a __b.
-/// \param __b
-/// An integer. The lower 16 bits of this parameter are written to the
-/// result beginning at an offset specified by \a __imm.
-/// \param __imm
-/// An immediate value specifying which of the eight 16-bit elements of the
-/// result is replaced by the lower 16 bits of \a __b.
-/// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi16(a, b, imm) \
- ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
- (int)(imm)))
-
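A minimal sketch of the extract/insert pair above; the element selectors must be compile-time constants.

    #include <emmintrin.h>

    static __m128i extract_insert_demo(__m128i v)
    {
        int w3 = _mm_extract_epi16(v, 3);      /* element 3, zero-extended to int */
        (void)w3;
        return _mm_insert_epi16(v, 123, 5);    /* replace element 5 with 123 */
    }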
-/// Copies the values of the most significant bits from each 8-bit
-/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
-/// value, zero-extends the value, and writes it to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the values with bits to be extracted.
-/// \returns The most significant bits from each 8-bit element in \a __a,
-/// written to bits [15:0]. The other bits are assigned zeros.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_movemask_epi8(__m128i __a)
-{
- return __builtin_ia32_pmovmskb128((__v16qi)__a);
-}
-
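A common use of _mm_movemask_epi8 is locating a matching byte; a minimal sketch, assuming the GCC/Clang __builtin_ctz builtin and the hypothetical name find_byte.

    #include <emmintrin.h>

    static int find_byte(__m128i v, char c)
    {
        __m128i eq   = _mm_cmpeq_epi8(v, _mm_set1_epi8(c));   /* 0xFF where bytes match */
        int     mask = _mm_movemask_epi8(eq);                 /* one bit per byte */
        return mask ? __builtin_ctz(mask) : -1;               /* index of first match */
    }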
-/// Constructs a 128-bit integer vector by shuffling four 32-bit
-/// elements of a 128-bit integer vector parameter, using the immediate-value
-/// parameter as a specifier.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
-///
-/// \param a
-/// A 128-bit integer vector containing the values to be copied.
-/// \param imm
-/// An immediate value containing an 8-bit value specifying which elements to
-/// copy from \a a. The elements of the result are assigned
-/// values as follows: \n
-/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
-/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
-/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
-/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
-/// Bit value assignments: \n
-/// 00: assign values from bits [31:0] of \a a. \n
-/// 01: assign values from bits [63:32] of \a a. \n
-/// 10: assign values from bits [95:64] of \a a. \n
-/// 11: assign values from bits [127:96] of \a a.
-/// \returns A 128-bit integer vector containing the shuffled values.
-#define _mm_shuffle_epi32(a, imm) \
- ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
-
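A sketch of a typical selector: broadcasting element 0 to all four lanes. The _MM_SHUFFLE macro (from <xmmintrin.h>) packs the four 2-bit indices into the immediate.

    #include <emmintrin.h>

    static __m128i broadcast_lane0(__m128i v)
    {
        return _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 0, 0, 0));  /* { v0, v0, v0, v0 } */
    }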
-/// Constructs a 128-bit integer vector by shuffling four lower 16-bit
-/// elements of a 128-bit integer vector of [8 x i16], using the immediate
-/// value parameter as a specifier.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
-///
-/// \param a
-/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
-/// [127:64] of the result.
-/// \param imm
-/// An 8-bit immediate value specifying which elements to copy from \a a. \n
-/// Bits[1:0] are used to assign values to bits [15:0] of the result. \n
-/// Bits[3:2] are used to assign values to bits [31:16] of the result. \n
-/// Bits[5:4] are used to assign values to bits [47:32] of the result. \n
-/// Bits[7:6] are used to assign values to bits [63:48] of the result. \n
-/// Bit value assignments: \n
-/// 00: assign values from bits [15:0] of \a a. \n
-/// 01: assign values from bits [31:16] of \a a. \n
-/// 10: assign values from bits [47:32] of \a a. \n
-/// 11: assign values from bits [63:48] of \a a. \n
-/// \returns A 128-bit integer vector containing the shuffled values.
-#define _mm_shufflelo_epi16(a, imm) \
- ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
-
-/// Constructs a 128-bit integer vector by shuffling four upper 16-bit
-/// elements of a 128-bit integer vector of [8 x i16], using the immediate
-/// value parameter as a specifier.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
-///
-/// \param a
-/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
-/// [63:0] of the result.
-/// \param imm
-/// An 8-bit immediate value specifying which elements to copy from \a a. \n
-/// Bits[1:0] are used to assign values to bits [79:64] of the result. \n
-/// Bits[3:2] are used to assign values to bits [95:80] of the result. \n
-/// Bits[5:4] are used to assign values to bits [111:96] of the result. \n
-/// Bits[7:6] are used to assign values to bits [127:112] of the result. \n
-/// Bit value assignments: \n
-/// 00: assign values from bits [79:64] of \a a. \n
-/// 01: assign values from bits [95:80] of \a a. \n
-/// 10: assign values from bits [111:96] of \a a. \n
-/// 11: assign values from bits [127:112] of \a a. \n
-/// \returns A 128-bit integer vector containing the shuffled values.
-#define _mm_shufflehi_epi16(a, imm) \
- ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
-
-/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
-/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [16 x i8].
-/// Bits [71:64] are written to bits [7:0] of the result. \n
-/// Bits [79:72] are written to bits [23:16] of the result. \n
-/// Bits [87:80] are written to bits [39:32] of the result. \n
-/// Bits [95:88] are written to bits [55:48] of the result. \n
-/// Bits [103:96] are written to bits [71:64] of the result. \n
-/// Bits [111:104] are written to bits [87:80] of the result. \n
-/// Bits [119:112] are written to bits [103:96] of the result. \n
-/// Bits [127:120] are written to bits [119:112] of the result.
-/// \param __b
-/// A 128-bit vector of [16 x i8]. \n
-/// Bits [71:64] are written to bits [15:8] of the result. \n
-/// Bits [79:72] are written to bits [31:24] of the result. \n
-/// Bits [87:80] are written to bits [47:40] of the result. \n
-/// Bits [95:88] are written to bits [63:56] of the result. \n
-/// Bits [103:96] are written to bits [79:72] of the result. \n
-/// Bits [111:104] are written to bits [95:88] of the result. \n
-/// Bits [119:112] are written to bits [111:104] of the result. \n
-/// Bits [127:120] are written to bits [127:120] of the result.
-/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_unpackhi_epi8(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
-}
-
-/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
-/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [8 x i16].
-/// Bits [79:64] are written to bits [15:0] of the result. \n
-/// Bits [95:80] are written to bits [47:32] of the result. \n
-/// Bits [111:96] are written to bits [79:64] of the result. \n
-/// Bits [127:112] are written to bits [111:96] of the result.
-/// \param __b
-/// A 128-bit vector of [8 x i16].
-/// Bits [79:64] are written to bits [31:16] of the result. \n
-/// Bits [95:80] are written to bits [63:48] of the result. \n
-/// Bits [111:96] are written to bits [95:80] of the result. \n
-/// Bits [127:112] are written to bits [127:112] of the result.
-/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_unpackhi_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
-}
-
-/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
-/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x i32]. \n
-/// Bits [95:64] are written to bits [31:0] of the destination. \n
-/// Bits [127:96] are written to bits [95:64] of the destination.
-/// \param __b
-/// A 128-bit vector of [4 x i32]. \n
-/// Bits [95:64] are written to bits [63:32] of the destination. \n
-/// Bits [127:96] are written to bits [127:96] of the destination.
-/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_unpackhi_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
-}
-
-/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
-/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x i64]. \n
-/// Bits [127:64] are written to bits [63:0] of the destination.
-/// \param __b
-/// A 128-bit vector of [2 x i64]. \n
-/// Bits [127:64] are written to bits [127:64] of the destination.
-/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_unpackhi_epi64(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
-}
-
-/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
-/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [16 x i8]. \n
-/// Bits [7:0] are written to bits [7:0] of the result. \n
-/// Bits [15:8] are written to bits [23:16] of the result. \n
-/// Bits [23:16] are written to bits [39:32] of the result. \n
-/// Bits [31:24] are written to bits [55:48] of the result. \n
-/// Bits [39:32] are written to bits [71:64] of the result. \n
-/// Bits [47:40] are written to bits [87:80] of the result. \n
-/// Bits [55:48] are written to bits [103:96] of the result. \n
-/// Bits [63:56] are written to bits [119:112] of the result.
-/// \param __b
-/// A 128-bit vector of [16 x i8].
-/// Bits [7:0] are written to bits [15:8] of the result. \n
-/// Bits [15:8] are written to bits [31:24] of the result. \n
-/// Bits [23:16] are written to bits [47:40] of the result. \n
-/// Bits [31:24] are written to bits [63:56] of the result. \n
-/// Bits [39:32] are written to bits [79:72] of the result. \n
-/// Bits [47:40] are written to bits [95:88] of the result. \n
-/// Bits [55:48] are written to bits [111:104] of the result. \n
-/// Bits [63:56] are written to bits [127:120] of the result.
-/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_unpacklo_epi8(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
-}
-
-/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
-/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
-/// [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [8 x i16].
-/// Bits [15:0] are written to bits [15:0] of the result. \n
-/// Bits [31:16] are written to bits [47:32] of the result. \n
-/// Bits [47:32] are written to bits [79:64] of the result. \n
-/// Bits [63:48] are written to bits [111:96] of the result.
-/// \param __b
-/// A 128-bit vector of [8 x i16].
-/// Bits [15:0] are written to bits [31:16] of the result. \n
-/// Bits [31:16] are written to bits [63:48] of the result. \n
-/// Bits [47:32] are written to bits [95:80] of the result. \n
-/// Bits [63:48] are written to bits [127:112] of the result.
-/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_unpacklo_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
-}
-
-/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
-/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x i32]. \n
-/// Bits [31:0] are written to bits [31:0] of the destination. \n
-/// Bits [63:32] are written to bits [95:64] of the destination.
-/// \param __b
-/// A 128-bit vector of [4 x i32]. \n
-/// Bits [31:0] are written to bits [63:32] of the destination. \n
-/// Bits [63:32] are written to bits [127:96] of the destination.
-/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_unpacklo_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
-}
-
-/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
-/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x i64]. \n
-/// Bits [63:0] are written to bits [63:0] of the destination. \n
-/// \param __b
-/// A 128-bit vector of [2 x i64]. \n
-/// Bits [63:0] are written to bits [127:64] of the destination. \n
-/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_unpacklo_epi64(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
-}
-
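Interleaving with a zero vector is the classic SSE2 widening idiom; a minimal sketch that zero-extends the low eight unsigned bytes to eight 16-bit lanes.

    #include <emmintrin.h>

    static __m128i widen_lo_u8_to_u16(__m128i v)
    {
        return _mm_unpacklo_epi8(v, _mm_setzero_si128());   /* each byte paired with 0x00 */
    }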
-/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
-/// integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector operand. The lower 64 bits are moved to the
-/// destination.
-/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_movepi64_pi64(__m128i __a)
-{
- return (__m64)__a[0];
-}
-
-/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
-/// upper bits.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
-///
-/// \param __a
-/// A 64-bit value.
-/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
-/// the operand. The upper 64 bits are assigned zeros.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_movpi64_epi64(__m64 __a)
-{
- return __extension__ (__m128i)(__v2di){ (long long)__a, 0 };
-}
-
-/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
-/// integer vector, zeroing the upper bits.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
-///
-/// \param __a
-/// A 128-bit integer vector operand. The lower 64 bits are moved to the
-/// destination.
-/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
-/// the operand. The upper 64 bits are assigned zeros.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_move_epi64(__m128i __a)
-{
- return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2);
-}
-
-/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
-/// [2 x double] and interleaves them into a 128-bit vector of [2 x
-/// double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. \n
-/// Bits [127:64] are written to bits [63:0] of the destination.
-/// \param __b
-/// A 128-bit vector of [2 x double]. \n
-/// Bits [127:64] are written to bits [127:64] of the destination.
-/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_unpackhi_pd(__m128d __a, __m128d __b)
-{
- return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
-}
-
-/// Unpacks the low-order 64-bit elements from two 128-bit vectors
-/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
-/// double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. \n
-/// Bits [63:0] are written to bits [63:0] of the destination.
-/// \param __b
-/// A 128-bit vector of [2 x double]. \n
-/// Bits [63:0] are written to bits [127:64] of the destination.
-/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_unpacklo_pd(__m128d __a, __m128d __b)
-{
- return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
-}
-
-/// Extracts the sign bits of the double-precision values in the 128-bit
-/// vector of [2 x double], zero-extends the value, and writes it to the
-/// low-order bits of the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the values with sign bits to
-/// be extracted.
-/// \returns The sign bits from each of the double-precision elements in \a __a,
-/// written to bits [1:0]. The remaining bits are assigned values of zero.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_movemask_pd(__m128d __a)
-{
- return __builtin_ia32_movmskpd((__v2df)__a);
-}
-
-
-/// Constructs a 128-bit floating-point vector of [2 x double] from two
-/// 128-bit vector parameters of [2 x double], using the immediate-value
-/// parameter as a specifier.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
-///
-/// \param a
-/// A 128-bit vector of [2 x double].
-/// \param b
-/// A 128-bit vector of [2 x double].
-/// \param i
-/// An 8-bit immediate value. The least significant two bits specify which
-/// elements to copy from \a a and \a b: \n
-/// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
-/// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
-/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
-/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
-/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
-#define _mm_shuffle_pd(a, b, i) \
- ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
- (int)(i)))
-
-/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
-/// floating-point vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit floating-point vector of [2 x double].
-/// \returns A 128-bit floating-point vector of [4 x float] containing the same
-/// bitwise pattern as the parameter.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_castpd_ps(__m128d __a)
-{
- return (__m128)__a;
-}
-
-/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
-/// integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit floating-point vector of [2 x double].
-/// \returns A 128-bit integer vector containing the same bitwise pattern as the
-/// parameter.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_castpd_si128(__m128d __a)
-{
- return (__m128i)__a;
-}
-
-/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
-/// floating-point vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit floating-point vector of [4 x float].
-/// \returns A 128-bit floating-point vector of [2 x double] containing the same
-/// bitwise pattern as the parameter.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_castps_pd(__m128 __a)
-{
- return (__m128d)__a;
-}
-
-/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
-/// integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit floating-point vector of [4 x float].
-/// \returns A 128-bit integer vector containing the same bitwise pattern as the
-/// parameter.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_castps_si128(__m128 __a)
-{
- return (__m128i)__a;
-}
-
-/// Casts a 128-bit integer vector into a 128-bit floating-point vector
-/// of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \returns A 128-bit floating-point vector of [4 x float] containing the same
-/// bitwise pattern as the parameter.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_castsi128_ps(__m128i __a)
-{
- return (__m128)__a;
-}
-
-/// Casts a 128-bit integer vector into a 128-bit floating-point vector
-/// of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit integer vector.
-/// \returns A 128-bit floating-point vector of [2 x double] containing the same
-/// bitwise pattern as the parameter.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_castsi128_pd(__m128i __a)
-{
- return (__m128d)__a;
-}
-
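The casts above reinterpret bits without emitting a conversion instruction, which makes bitwise tricks on floating-point data cheap; a minimal sketch computing the absolute value of four floats by clearing the sign bits.

    #include <emmintrin.h>

    static __m128 abs_ps(__m128 x)
    {
        __m128i mask = _mm_set1_epi32(0x7FFFFFFF);   /* all bits except the sign bit */
        return _mm_castsi128_ps(_mm_and_si128(_mm_castps_si128(x), mask));
    }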
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/// Indicates that a spin loop is being executed for the purposes of
-/// optimizing power consumption during the loop.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
-///
-void _mm_pause(void);
-
-#if defined(__cplusplus)
-} // extern "C"
-#endif
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
-
-#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))
-
-#define _MM_DENORMALS_ZERO_ON (0x0040U)
-#define _MM_DENORMALS_ZERO_OFF (0x0000U)
-
-#define _MM_DENORMALS_ZERO_MASK (0x0040U)
-
-#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
-#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
-
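A sketch of enabling denormals-are-zero for the calling thread, assuming the CPU implements the DAZ bit in MXCSR.

    #include <emmintrin.h>

    static void enable_daz(void)
    {
        _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);   /* denormal inputs read as 0 */
    }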
-#endif /* __EMMINTRIN_H */
+++ /dev/null
-/*===------------------ enqcmdintrin.h - enqcmd intrinsics -----------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <enqcmdintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __ENQCMDINTRIN_H
-#define __ENQCMDINTRIN_H
-
-/* Define the default attributes for the functions in this file */
-#define _DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("enqcmd")))
-
-/// Reads the 64-byte command pointed to by \a __src, formats 64-byte enqueue
-/// store data, and performs a 64-byte enqueue store to the memory pointed to
-/// by \a __dst. This intrinsic may only be used in User mode.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> ENQCMD </c> instruction.
-///
-/// \param __dst
-/// Pointer to the destination of the enqueue store.
-/// \param __src
-/// Pointer to 64-byte command data.
-/// \returns If the command data is successfully written to \a __dst then 0 is
-/// returned. Otherwise 1 is returned.
-static __inline__ int _DEFAULT_FN_ATTRS
-_enqcmd (void *__dst, const void *__src)
-{
- return __builtin_ia32_enqcmd(__dst, __src);
-}
-
-/// Reads the 64-byte command pointed to by \a __src, formats 64-byte enqueue
-/// store data, and performs a 64-byte enqueue store to the memory pointed to
-/// by \a __dst. This intrinsic may only be used in Privileged mode.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> ENQCMDS </c> instruction.
-///
-/// \param __dst
-/// Pointer to the destination of the enqueue store.
-/// \param __src
-/// Pointer to 64-byte command data.
-/// \returns If the command data is successfully written to \a __dst then 0 is
-/// returned. Otherwise 1 is returned.
-static __inline__ int _DEFAULT_FN_ATTRS
-_enqcmds (void *__dst, const void *__src)
-{
- return __builtin_ia32_enqcmds(__dst, __src);
-}
-
-#undef _DEFAULT_FN_ATTRS
-
-#endif /* __ENQCMDINTRIN_H */
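A hedged sketch of how _enqcmd() is typically driven (assuming compilation with -menqcmd): the portal pointer and the 64-byte descriptor layout are illustrative and would normally come from a device driver, for example an mmap'd shared work queue.

#include <immintrin.h>

/* Returns 0 if the device accepted the command, 1 if it was rejected
 * (for example, the shared work queue was full). */
int submit_descriptor(void *portal, const void *desc /* 64 bytes */)
{
    return _enqcmd(portal, desc);
}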
+++ /dev/null
-/*===---- f16cintrin.h - F16C intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __IMMINTRIN_H
-#error "Never use <f16cintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __F16CINTRIN_H
-#define __F16CINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 \
- __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 \
- __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256)))
-
-/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h,
- * but that's because icc can emulate these without f16c using a library call.
- * Since we don't do that, let's leave these in f16cintrin.h.
- */
-
-/// Converts a 16-bit half-precision float value into a 32-bit float
-/// value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
-///
-/// \param __a
-/// A 16-bit half-precision float value.
-/// \returns The converted 32-bit float value.
-static __inline float __DEFAULT_FN_ATTRS128
-_cvtsh_ss(unsigned short __a)
-{
- __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
- __v4sf __r = __builtin_ia32_vcvtph2ps(__v);
- return __r[0];
-}
-
-/// Converts a 32-bit single-precision float value to a 16-bit
-/// half-precision float value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// unsigned short _cvtss_sh(float a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
-///
-/// \param a
-/// A 32-bit single-precision float value to be converted to a 16-bit
-/// half-precision float value.
-/// \param imm
-/// An immediate value controlling rounding using bits [2:0]: \n
-/// 000: Nearest \n
-/// 001: Down \n
-/// 010: Up \n
-/// 011: Truncate \n
-/// 1XX: Use MXCSR.RC for rounding
-/// \returns The converted 16-bit half-precision float value.
-#define _cvtss_sh(a, imm) \
- ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \
- (imm)))[0]))
-
-/// Converts a 128-bit vector containing 32-bit float values into a
-/// 128-bit vector containing 16-bit half-precision float values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_cvtps_ph(__m128 a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
-///
-/// \param a
-/// A 128-bit vector containing 32-bit float values.
-/// \param imm
-/// An immediate value controlling rounding using bits [2:0]: \n
-/// 000: Nearest \n
-/// 001: Down \n
-/// 010: Up \n
-/// 011: Truncate \n
-/// 1XX: Use MXCSR.RC for rounding
-/// \returns A 128-bit vector containing converted 16-bit half-precision float
-/// values. The lower 64 bits are used to store the converted 16-bit
-/// half-precision floating-point values.
-#define _mm_cvtps_ph(a, imm) \
- ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm)))
-
-/// Converts a 128-bit vector containing 16-bit half-precision float
-/// values into a 128-bit vector containing 32-bit float values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector containing 16-bit half-precision float values. The lower
-/// 64 bits are used in the conversion.
-/// \returns A 128-bit vector of [4 x float] containing converted float values.
-static __inline __m128 __DEFAULT_FN_ATTRS128
-_mm_cvtph_ps(__m128i __a)
-{
- return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
-}
-
-/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
-/// containing 16-bit half-precision float values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm256_cvtps_ph(__m256 a, const int imm);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VCVTPS2PH </c> instruction.
-///
-/// \param a
-/// A 256-bit vector containing 32-bit single-precision float values to be
-/// converted to 16-bit half-precision float values.
-/// \param imm
-/// An immediate value controlling rounding using bits [2:0]: \n
-/// 000: Nearest \n
-/// 001: Down \n
-/// 010: Up \n
-/// 011: Truncate \n
-/// 1XX: Use MXCSR.RC for rounding
-/// \returns A 128-bit vector containing the converted 16-bit half-precision
-/// float values.
-#define _mm256_cvtps_ph(a, imm) \
- ((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm)))
-
-/// Converts a 128-bit vector containing 16-bit half-precision float
-/// values into a 256-bit vector of [8 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTPH2PS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector containing 16-bit half-precision float values to be
-/// converted to 32-bit single-precision float values.
-/// \returns A vector of [8 x float] containing the converted 32-bit
-/// single-precision float values.
-static __inline __m256 __DEFAULT_FN_ATTRS256
-_mm256_cvtph_ps(__m128i __a)
-{
- return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __F16CINTRIN_H */
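As a quick illustration (assuming -mf16c), the two scalar conversions above can be paired to quantize a float through the 16-bit half-precision format; the function name is illustrative.

#include <immintrin.h>

float quantize_to_half(float x)
{
    unsigned short h = _cvtss_sh(x, 0); /* imm bits [2:0] = 000: round to nearest */
    return _cvtsh_ss(h);                /* widen back to a 32-bit float */
}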
+++ /dev/null
-/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __FMA4INTRIN_H
-#define __FMA4INTRIN_H
-
-#include <pmmintrin.h>
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __FMA4INTRIN_H */
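A brief sketch of the FMA4 naming scheme above (assuming -mfma4 on hardware that supports it): macc is A*B + C, msub is A*B - C, and the nm variants negate the product.

#include <x86intrin.h>

__m128 axpy4(__m128 a, __m128 x, __m128 y)
{
    return _mm_macc_ps(a, x, y);   /* a * x + y with a single rounding */
}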
+++ /dev/null
-/*===---- fmaintrin.h - FMA intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __FMAINTRIN_H
-#define __FMAINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C)
-{
- return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C)
-{
- return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C)
-{
- return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C)
-{
- return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C);
-}
-
-#undef __DEFAULT_FN_ATTRS128
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __FMAINTRIN_H */
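A minimal sketch using the 256-bit FMA form above (assuming -mfma, which implies AVX): accumulating a dot product with fused multiply-adds. The pointer names and the requirement that n be a multiple of 8 are assumptions of this example.

#include <immintrin.h>

float dot8n(const float *a, const float *b, int n /* multiple of 8 */)
{
    __m256 acc = _mm256_setzero_ps();
    for (int i = 0; i < n; i += 8)
        acc = _mm256_fmadd_ps(_mm256_loadu_ps(a + i),
                              _mm256_loadu_ps(b + i), acc);
    /* Horizontal sum of the eight partial sums. */
    __m128 s = _mm_add_ps(_mm256_castps256_ps128(acc),
                          _mm256_extractf128_ps(acc, 1));
    s = _mm_hadd_ps(s, s);
    s = _mm_hadd_ps(s, s);
    return _mm_cvtss_f32(s);
}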
+++ /dev/null
-/*===---- fxsrintrin.h - FXSR intrinsic ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <fxsrintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __FXSRINTRIN_H
-#define __FXSRINTRIN_H
-
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr")))
-
-/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
-/// memory region pointed to by the input parameter \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> FXSAVE </c> instruction.
-///
-/// \param __p
-/// A pointer to a 512-byte memory region. The beginning of this memory
-/// region should be aligned on a 16-byte boundary.
-static __inline__ void __DEFAULT_FN_ATTRS
-_fxsave(void *__p)
-{
- __builtin_ia32_fxsave(__p);
-}
-
-/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
-/// memory region pointed to by the input parameter \a __p. The contents of
-/// this memory region should have been written to by a previous \c _fxsave
-/// or \c _fxsave64 intrinsic.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> FXRSTOR </c> instruction.
-///
-/// \param __p
-/// A pointer to a 512-byte memory region. The beginning of this memory
-/// region should be aligned on a 16-byte boundary.
-static __inline__ void __DEFAULT_FN_ATTRS
-_fxrstor(void *__p)
-{
- __builtin_ia32_fxrstor(__p);
-}
-
-#ifdef __x86_64__
-/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte
-/// memory region pointed to by the input parameter \a __p.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> FXSAVE64 </c> instruction.
-///
-/// \param __p
-/// A pointer to a 512-byte memory region. The beginning of this memory
-/// region should be aligned on a 16-byte boundary.
-static __inline__ void __DEFAULT_FN_ATTRS
-_fxsave64(void *__p)
-{
- __builtin_ia32_fxsave64(__p);
-}
-
-/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte
-/// memory region pointed to by the input parameter \a __p. The contents of
-/// this memory region should have been written to by a previous \c _fxsave
-/// or \c _fxsave64 intrinsic.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> FXRSTOR64 </c> instruction.
-///
-/// \param __p
-/// A pointer to a 512-byte memory region. The beginning of this memory
-/// region should be aligned on a 16-byte boundary.
-static __inline__ void __DEFAULT_FN_ATTRS
-_fxrstor64(void *__p)
-{
- __builtin_ia32_fxrstor64(__p);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
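A short sketch of the save/restore pair above (assuming -mfxsr); the callback shape is illustrative. The only hard requirement documented above is the 512-byte, 16-byte-aligned save area.

#include <immintrin.h>

void with_saved_fp_state(void (*fn)(void))
{
    _Alignas(16) unsigned char area[512]; /* 512-byte, 16-byte-aligned region */

    _fxsave(area);
    fn();
    _fxrstor(area);
}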
+++ /dev/null
-/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __GFNIINTRIN_H
-#define __GFNIINTRIN_H
-
-/* Default attributes for simple form (no masking). */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128)))
-
-/* Default attributes for YMM unmasked form. */
-#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
-
-/* Default attributes for ZMM forms. */
-#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
-
-/* Default attributes for VLX forms. */
-#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
-
-#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \
- ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), \
- (char)(I)))
-
-#define _mm_gf2p8affine_epi64_epi8(A, B, I) \
- ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), \
- (char)(I)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_gf2p8mul_epi8(__m128i __A, __m128i __B)
-{
- return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
- (__v16qi) __B);
-}
-
-#ifdef __AVXINTRIN_H
-#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \
- ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
- (__v32qi)(__m256i)(B), \
- (char)(I)))
-
-#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \
- ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \
- (__v32qi)(__m256i)(B), \
- (char)(I)))
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS_Y
-_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B)
-{
- return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A,
- (__v32qi) __B);
-}
-#endif /* __AVXINTRIN_H */
-
-#ifdef __AVX512BWINTRIN_H
-#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \
- ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \
- (__v64qi)(__m512i)(B), \
- (char)(I)))
-
-#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
- ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
- (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \
- (__v64qi)(__m512i)(S)))
-
-#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
- _mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \
- U, A, B, I)
-
-#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \
- ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \
- (__v64qi)(__m512i)(B), \
- (char)(I)))
-
-#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
- ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \
- (__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \
- (__v64qi)(__m512i)(S)))
-
-#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
- _mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \
- U, A, B, I)
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
-_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
-{
- return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A,
- (__v64qi) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
-_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
-{
- return (__m512i) __builtin_ia32_selectb_512(__U,
- (__v64qi) _mm512_gf2p8mul_epi8(__A, __B),
- (__v64qi) __S);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
-_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
-{
- return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
- __U, __A, __B);
-}
-#endif /* __AVX512BWINTRIN_H */
-
-#ifdef __AVX512VLBWINTRIN_H
-#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
- ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
- (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \
- (__v16qi)(__m128i)(S)))
-
-#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
- _mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \
- U, A, B, I)
-
-#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \
- ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
- (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \
- (__v32qi)(__m256i)(S)))
-
-#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \
- _mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \
- U, A, B, I)
-
-#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
- ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \
- (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \
- (__v16qi)(__m128i)(S)))
-
-#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
- _mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I)
-
-#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \
- ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \
- (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \
- (__v32qi)(__m256i)(S)))
-
-#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \
- _mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \
- U, A, B, I)
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
-_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B)
-{
- return (__m128i) __builtin_ia32_selectb_128(__U,
- (__v16qi) _mm_gf2p8mul_epi8(__A, __B),
- (__v16qi) __S);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128
-_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B)
-{
- return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(),
- __U, __A, __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
-_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B)
-{
- return (__m256i) __builtin_ia32_selectb_256(__U,
- (__v32qi) _mm256_gf2p8mul_epi8(__A, __B),
- (__v32qi) __S);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256
-_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B)
-{
- return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(),
- __U, __A, __B);
-}
-#endif /* __AVX512VLBWINTRIN_H */
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_Y
-#undef __DEFAULT_FN_ATTRS_Z
-#undef __DEFAULT_FN_ATTRS_VL128
-#undef __DEFAULT_FN_ATTRS_VL256
-
-#endif /* __GFNIINTRIN_H */
-
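For reference, a tiny sketch of the unmasked GF(2^8) multiply above (assuming -mgfni -msse2): each byte lane is multiplied in the field defined by the AES polynomial x^8 + x^4 + x^3 + x + 1.

#include <immintrin.h>

__m128i gf_mul_bytes(__m128i a, __m128i b)
{
    return _mm_gf2p8mul_epi8(a, b); /* 16 independent GF(2^8) products */
}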
+++ /dev/null
-/*===---------------- hresetintrin.h - HRESET intrinsics -------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __X86GPRINTRIN_H
-#error "Never use <hresetintrin.h> directly; include <x86gprintrin.h> instead."
-#endif
-
-#ifndef __HRESETINTRIN_H
-#define __HRESETINTRIN_H
-
-#if __has_extension(gnu_asm)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("hreset")))
-
-/// Provides a hint to the processor to selectively reset the prediction
-/// history of the current logical processor, as specified by the 32-bit
-/// integer value \a __eax.
-///
-/// This intrinsic corresponds to the <c> HRESET </c> instruction.
-///
-/// \operation
-/// IF __eax == 0
-/// // nop
-/// ELSE
-/// FOR i := 0 to 31
-/// IF __eax[i]
-/// ResetPredictionFeature(i)
-/// FI
-/// ENDFOR
-/// FI
-/// \endoperation
-static __inline void __DEFAULT_FN_ATTRS
-_hreset(int __eax)
-{
- __asm__ ("hreset $0" :: "a"(__eax));
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __has_extension(gnu_asm) */
-
-#endif /* __HRESETINTRIN_H */
+++ /dev/null
-/* ===-------- ia32intrin.h ---------------------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#error "Never use <ia32intrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __IA32INTRIN_H
-#define __IA32INTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
-#define __DEFAULT_FN_ATTRS_CRC32 __attribute__((__always_inline__, __nodebug__, __target__("crc32")))
-
-#if defined(__cplusplus) && (__cplusplus >= 201103L)
-#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr
-#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
-#else
-#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__))
-#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
-#endif
-
-/** Find the first set bit starting from the lsb. Result is undefined if
- * input is 0.
- *
- * \headerfile <x86intrin.h>
- *
- * This intrinsic corresponds to the <c> BSF </c> instruction or the
- * <c> TZCNT </c> instruction.
- *
- * \param __A
- * A 32-bit integer operand.
- * \returns A 32-bit integer containing the bit number.
- */
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-__bsfd(int __A) {
- return __builtin_ctz(__A);
-}
-
-/** Find the first set bit starting from the msb. Result is undefined if
- * input is 0.
- *
- * \headerfile <x86intrin.h>
- *
- * This intrinsic corresponds to the <c> BSR </c> instruction or the
- * <c> LZCNT </c> instruction and an <c> XOR </c>.
- *
- * \param __A
- * A 32-bit integer operand.
- * \returns A 32-bit integer containing the bit number.
- */
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-__bsrd(int __A) {
- return 31 - __builtin_clz(__A);
-}
-
-/** Swaps the bytes in the input, converting little endian to big endian or
- * vice versa.
- *
- * \headerfile <x86intrin.h>
- *
- * This intrinsic corresponds to the <c> BSWAP </c> instruction.
- *
- * \param __A
- * A 32-bit integer operand.
- * \returns A 32-bit integer containing the swapped bytes.
- */
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-__bswapd(int __A) {
- return __builtin_bswap32(__A);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-_bswap(int __A) {
- return __builtin_bswap32(__A);
-}
-
-#define _bit_scan_forward(A) __bsfd((A))
-#define _bit_scan_reverse(A) __bsrd((A))
-
-#ifdef __x86_64__
-/** Find the first set bit starting from the lsb. Result is undefined if
- * input is 0.
- *
- * \headerfile <x86intrin.h>
- *
- * This intrinsic corresponds to the <c> BSF </c> instruction or the
- * <c> TZCNT </c> instruction.
- *
- * \param __A
- * A 64-bit integer operand.
- * \returns A 32-bit integer containing the bit number.
- */
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-__bsfq(long long __A) {
- return __builtin_ctzll(__A);
-}
-
-/** Find the first set bit starting from the msb. Result is undefined if
- * input is 0.
- *
- * \headerfile <x86intrin.h>
- *
- * This intrinsic corresponds to the <c> BSR </c> instruction or the
- * <c> LZCNT </c> instruction and an <c> XOR </c>.
- *
- * \param __A
- * A 64-bit integer operand.
- * \returns A 32-bit integer containing the bit number.
- */
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-__bsrq(long long __A) {
- return 63 - __builtin_clzll(__A);
-}
-
-/** Swaps the bytes in the input, converting little endian to big endian or
- * vice versa.
- *
- * \headerfile <x86intrin.h>
- *
- * This intrinsic corresponds to the <c> BSWAP </c> instruction.
- *
- * \param __A
- * A 64-bit integer operand.
- * \returns A 64-bit integer containing the swapped bytes.
- */
-static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
-__bswapq(long long __A) {
- return __builtin_bswap64(__A);
-}
-
-#define _bswap64(A) __bswapq((A))
-#endif
-
-/** Counts the number of bits in the source operand having a value of 1.
- *
- * \headerfile <x86intrin.h>
- *
- * This intrinsic corresponds to the <c> POPCNT </c> instruction or a
- * sequence of arithmetic and logic ops to calculate it.
- *
- * \param __A
- * An unsigned 32-bit integer operand.
- * \returns A 32-bit integer containing the number of bits with value 1 in the
- * source operand.
- */
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-__popcntd(unsigned int __A)
-{
- return __builtin_popcount(__A);
-}
-
-#define _popcnt32(A) __popcntd((A))
-
-#ifdef __x86_64__
-/** Counts the number of bits in the source operand having a value of 1.
- *
- * \headerfile <x86intrin.h>
- *
- * This intrinsic corresponds to the <c> POPCNT </c> instruction or a
- * sequence of arithmetic and logic ops to calculate it.
- *
- * \param __A
- * An unsigned 64-bit integer operand.
- * \returns A 64-bit integer containing the number of bits with value 1 in the
- * source operand.
- */
-static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
-__popcntq(unsigned long long __A)
-{
- return __builtin_popcountll(__A);
-}
-
-#define _popcnt64(A) __popcntq((A))
-#endif /* __x86_64__ */
-
-#ifdef __x86_64__
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__readeflags(void)
-{
- return __builtin_ia32_readeflags_u64();
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-__writeeflags(unsigned long long __f)
-{
- __builtin_ia32_writeeflags_u64(__f);
-}
-
-#else /* !__x86_64__ */
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__readeflags(void)
-{
- return __builtin_ia32_readeflags_u32();
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-__writeeflags(unsigned int __f)
-{
- __builtin_ia32_writeeflags_u32(__f);
-}
-#endif /* !__x86_64__ */
-
-/** Casts a 32-bit float value to a 32-bit unsigned integer value.
- *
- * \headerfile <x86intrin.h>
- * This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction in x86_64,
- * and corresponds to the <c> VMOVL / MOVL </c> instruction in ia32.
- *
- * \param __A
- * A 32-bit float value.
- * \returns a 32-bit unsigned integer containing the converted value.
- */
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST
-_castf32_u32(float __A) {
- return __builtin_bit_cast(unsigned int, __A);
-}
-
-/** Casts a 64-bit float value to a 64-bit unsigned integer value.
- *
- * \headerfile <x86intrin.h>
- * This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction in x86_64,
- * and corresponds to the <c> VMOVL / MOVL </c> instruction in ia32.
- *
- * \param __A
- * A 64-bit float value.
- * \returns a 64-bit unsigned integer containing the converted value.
- */
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST
-_castf64_u64(double __A) {
- return __builtin_bit_cast(unsigned long long, __A);
-}
-
-/** Casts a 32-bit unsigned integer value to a 32-bit float value.
- *
- * \headerfile <x86intrin.h>
- * This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction in x86_64,
- * and corresponds to the <c> FLDS </c> instruction in ia32.
- *
- * \param __A
- * A 32-bit unsigned integer value.
- * \returns a 32-bit float value containing the converted value.
- */
-static __inline__ float __DEFAULT_FN_ATTRS_CAST
-_castu32_f32(unsigned int __A) {
- return __builtin_bit_cast(float, __A);
-}
-
-/** Casts a 64-bit unsigned integer value to a 64-bit float value.
- *
- * \headerfile <x86intrin.h>
- * This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction in x86_64,
- * and corresponds to the <c> FLDL </c> instruction in ia32.
- *
- * \param __A
- * A 64-bit unsigned integer value.
- * \returns a 64-bit float value containing the converted value.
- */
-static __inline__ double __DEFAULT_FN_ATTRS_CAST
-_castu64_f64(unsigned long long __A) {
- return __builtin_bit_cast(double, __A);
-}
-
-/** Adds the unsigned integer operand to the CRC-32C checksum of the
- * unsigned char operand.
- *
- * \headerfile <x86intrin.h>
- *
- * This intrinsic corresponds to the <c> CRC32B </c> instruction.
- *
- * \param __C
- * An unsigned integer operand to add to the CRC-32C checksum of operand
- * \a __D.
- * \param __D
- * An unsigned 8-bit integer operand used to compute the CRC-32C checksum.
- * \returns The result of adding operand \a __C to the CRC-32C checksum of
- * operand \a __D.
- */
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
-__crc32b(unsigned int __C, unsigned char __D)
-{
- return __builtin_ia32_crc32qi(__C, __D);
-}
-
-/** Adds the unsigned integer operand to the CRC-32C checksum of the
- * unsigned short operand.
- *
- * \headerfile <x86intrin.h>
- *
- * This intrinsic corresponds to the <c> CRC32W </c> instruction.
- *
- * \param __C
- * An unsigned integer operand to add to the CRC-32C checksum of operand
- * \a __D.
- * \param __D
- * An unsigned 16-bit integer operand used to compute the CRC-32C checksum.
- * \returns The result of adding operand \a __C to the CRC-32C checksum of
- * operand \a __D.
- */
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
-__crc32w(unsigned int __C, unsigned short __D)
-{
- return __builtin_ia32_crc32hi(__C, __D);
-}
-
-/** Adds the unsigned integer operand to the CRC-32C checksum of the
- * second unsigned integer operand.
- *
- * \headerfile <x86intrin.h>
- *
- * This intrinsic corresponds to the <c> CRC32D </c> instruction.
- *
- * \param __C
- * An unsigned integer operand to add to the CRC-32C checksum of operand
- * \a __D.
- * \param __D
- * An unsigned 32-bit integer operand used to compute the CRC-32C checksum.
- * \returns The result of adding operand \a __C to the CRC-32C checksum of
- * operand \a __D.
- */
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32
-__crc32d(unsigned int __C, unsigned int __D)
-{
- return __builtin_ia32_crc32si(__C, __D);
-}
-
-#ifdef __x86_64__
-/** Adds the unsigned integer operand to the CRC-32C checksum of the
- * unsigned 64-bit integer operand.
- *
- * \headerfile <x86intrin.h>
- *
- * This intrinsic corresponds to the <c> CRC32Q </c> instruction.
- *
- * \param __C
- * An unsigned integer operand to add to the CRC-32C checksum of operand
- * \a __D.
- * \param __D
- * An unsigned 64-bit integer operand used to compute the CRC-32C checksum.
- * \returns The result of adding operand \a __C to the CRC-32C checksum of
- * operand \a __D.
- */
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CRC32
-__crc32q(unsigned long long __C, unsigned long long __D)
-{
- return __builtin_ia32_crc32di(__C, __D);
-}
-#endif /* __x86_64__ */
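A hedged sketch of the usual CRC-32C pattern built from the byte-wise intrinsic above (assuming -mcrc32 or SSE4.2): seed with all ones and invert at the end, as in the common iSCSI/CRC-32C convention.

#include <x86intrin.h>
#include <stddef.h>

unsigned int crc32c(const unsigned char *p, size_t n)
{
    unsigned int c = 0xFFFFFFFFu;
    while (n--)
        c = __crc32b(c, *p++);
    return ~c;
}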
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__rdpmc(int __A) {
- return __builtin_ia32_rdpmc(__A);
-}
-
-/* __rdtscp */
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__rdtscp(unsigned int *__A) {
- return __builtin_ia32_rdtscp(__A);
-}
-
-#define _rdtsc() __rdtsc()
-
-#define _rdpmc(A) __rdpmc(A)
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_wbinvd(void) {
- __builtin_ia32_wbinvd();
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
-__rolb(unsigned char __X, int __C) {
- return __builtin_rotateleft8(__X, __C);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR
-__rorb(unsigned char __X, int __C) {
- return __builtin_rotateright8(__X, __C);
-}
-
-static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
-__rolw(unsigned short __X, int __C) {
- return __builtin_rotateleft16(__X, __C);
-}
-
-static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR
-__rorw(unsigned short __X, int __C) {
- return __builtin_rotateright16(__X, __C);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
-__rold(unsigned int __X, int __C) {
- return __builtin_rotateleft32(__X, __C);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR
-__rord(unsigned int __X, int __C) {
- return __builtin_rotateright32(__X, __C);
-}
-
-#ifdef __x86_64__
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
-__rolq(unsigned long long __X, int __C) {
- return __builtin_rotateleft64(__X, __C);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR
-__rorq(unsigned long long __X, int __C) {
- return __builtin_rotateright64(__X, __C);
-}
-#endif /* __x86_64__ */
-
-#ifndef _MSC_VER
-/* These are already provided as builtins for MSVC. */
-/* Select the correct function based on the size of long. */
-#ifdef __LP64__
-#define _lrotl(a,b) __rolq((a), (b))
-#define _lrotr(a,b) __rorq((a), (b))
-#else
-#define _lrotl(a,b) __rold((a), (b))
-#define _lrotr(a,b) __rord((a), (b))
-#endif
-#define _rotl(a,b) __rold((a), (b))
-#define _rotr(a,b) __rord((a), (b))
-#endif // _MSC_VER
-
-/* These are not builtins so need to be provided in all modes. */
-#define _rotwl(a,b) __rolw((a), (b))
-#define _rotwr(a,b) __rorw((a), (b))
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_CAST
-#undef __DEFAULT_FN_ATTRS_CRC32
-#undef __DEFAULT_FN_ATTRS_CONSTEXPR
-
-#endif /* __IA32INTRIN_H */
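A small sketch exercising a few of the scalar helpers above; the zero check matters because __bsfd/__bsrd are undefined for a zero input. The function name and output format are illustrative.

#include <x86intrin.h>
#include <stdio.h>

void describe_bits(unsigned int v)
{
    if (v != 0)
        printf("lowest set bit %d, highest set bit %d\n",
               __bsfd((int)v), __bsrd((int)v));
    printf("popcount %d, byteswap 0x%08x, rol8 0x%08x\n",
           __popcntd(v), (unsigned int)__bswapd((int)v), __rold(v, 8));
}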
+++ /dev/null
-/*===---- immintrin.h - Intel intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#define __IMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include <x86gprintrin.h>
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__MMX__)
-#include <mmintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__SSE__)
-#include <xmmintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__SSE2__)
-#include <emmintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__SSE3__)
-#include <pmmintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__SSSE3__)
-#include <tmmintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__SSE4_2__) || defined(__SSE4_1__))
-#include <smmintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__AES__) || defined(__PCLMUL__))
-#include <wmmintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__CLFLUSHOPT__)
-#include <clflushoptintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__CLWB__)
-#include <clwbintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX__)
-#include <avxintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX2__)
-#include <avx2intrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__F16C__)
-#include <f16cintrin.h>
-#endif
-
-/* No feature check desired due to internal checks */
-#include <bmiintrin.h>
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__BMI2__)
-#include <bmi2intrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__LZCNT__)
-#include <lzcntintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__POPCNT__)
-#include <popcntintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__FMA__)
-#include <fmaintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512F__)
-#include <avx512fintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512VL__)
-#include <avx512vlintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512BW__)
-#include <avx512bwintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512BITALG__)
-#include <avx512bitalgintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512CD__)
-#include <avx512cdintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512VPOPCNTDQ__)
-#include <avx512vpopcntdqintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__))
-#include <avx512vpopcntdqvlintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512VNNI__)
-#include <avx512vnniintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__AVX512VL__) && defined(__AVX512VNNI__))
-#include <avx512vlvnniintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVXVNNI__)
-#include <avxvnniintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512DQ__)
-#include <avx512dqintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__AVX512VL__) && defined(__AVX512BITALG__))
-#include <avx512vlbitalgintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__AVX512VL__) && defined(__AVX512BW__))
-#include <avx512vlbwintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__AVX512VL__) && defined(__AVX512CD__))
-#include <avx512vlcdintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__AVX512VL__) && defined(__AVX512DQ__))
-#include <avx512vldqintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512ER__)
-#include <avx512erintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512IFMA__)
-#include <avx512ifmaintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__AVX512IFMA__) && defined(__AVX512VL__))
-#include <avx512ifmavlintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512VBMI__)
-#include <avx512vbmiintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__AVX512VBMI__) && defined(__AVX512VL__))
-#include <avx512vbmivlintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512VBMI2__)
-#include <avx512vbmi2intrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__AVX512VBMI2__) && defined(__AVX512VL__))
-#include <avx512vlvbmi2intrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512PF__)
-#include <avx512pfintrin.h>
-#endif
-
-/*
- * FIXME: The _Float16 type is legal only when the hardware supports float16
- * operations. We use __AVX512FP16__ to identify whether float16 is supported,
- * so when float16 is not supported, the related headers are not included.
- *
- */
-#if defined(__AVX512FP16__)
-#include <avx512fp16intrin.h>
-#endif
-
-#if defined(__AVX512FP16__) && defined(__AVX512VL__)
-#include <avx512vlfp16intrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512BF16__)
-#include <avx512bf16intrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__AVX512VL__) && defined(__AVX512BF16__))
-#include <avx512vlbf16intrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__PKU__)
-#include <pkuintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__VPCLMULQDQ__)
-#include <vpclmulqdqintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__VAES__)
-#include <vaesintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__GFNI__)
-#include <gfniintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__RDPID__)
-/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
-///
-/// \headerfile <immintrin.h>
-///
-/// This intrinsic corresponds to the <c> RDPID </c> instruction.
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid")))
-_rdpid_u32(void) {
- return __builtin_ia32_rdpid();
-}
-#endif // __RDPID__
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__RDRND__)
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
-_rdrand16_step(unsigned short *__p)
-{
- return __builtin_ia32_rdrand16_step(__p);
-}
-
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
-_rdrand32_step(unsigned int *__p)
-{
- return __builtin_ia32_rdrand32_step(__p);
-}
-
-#ifdef __x86_64__
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
-_rdrand64_step(unsigned long long *__p)
-{
- return __builtin_ia32_rdrand64_step(__p);
-}
-#endif
-#endif /* __RDRND__ */
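Since RDRAND can transiently fail, the usual pattern (sketched here assuming -mrdrnd; the retry count is an assumption) bounds the retries and reports failure to the caller.

#include <immintrin.h>

int get_hw_random32(unsigned int *out)
{
    for (int tries = 0; tries < 10; ++tries)
        if (_rdrand32_step(out))
            return 1;  /* *out now holds a hardware random value */
    return 0;          /* hardware kept reporting failure */
}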
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__FSGSBASE__)
-#ifdef __x86_64__
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_readfsbase_u32(void)
-{
- return __builtin_ia32_rdfsbase32();
-}
-
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_readfsbase_u64(void)
-{
- return __builtin_ia32_rdfsbase64();
-}
-
-static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_readgsbase_u32(void)
-{
- return __builtin_ia32_rdgsbase32();
-}
-
-static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_readgsbase_u64(void)
-{
- return __builtin_ia32_rdgsbase64();
-}
-
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_writefsbase_u32(unsigned int __V)
-{
- __builtin_ia32_wrfsbase32(__V);
-}
-
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_writefsbase_u64(unsigned long long __V)
-{
- __builtin_ia32_wrfsbase64(__V);
-}
-
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_writegsbase_u32(unsigned int __V)
-{
- __builtin_ia32_wrgsbase32(__V);
-}
-
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase")))
-_writegsbase_u64(unsigned long long __V)
-{
- __builtin_ia32_wrgsbase64(__V);
-}
-
-#endif
-#endif /* __FSGSBASE__ */
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__MOVBE__)
-
-/* The structs used below are to force the load/store to be unaligned. This
- * is accomplished with the __packed__ attribute. The __may_alias__ prevents
- * tbaa metadata from being generated based on the struct and the type of the
- * field inside of it.
- */
-
-static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
-_loadbe_i16(void const * __P) {
- struct __loadu_i16 {
- short __v;
- } __attribute__((__packed__, __may_alias__));
- return __builtin_bswap16(((const struct __loadu_i16*)__P)->__v);
-}
-
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
-_storebe_i16(void * __P, short __D) {
- struct __storeu_i16 {
- short __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_i16*)__P)->__v = __builtin_bswap16(__D);
-}
-
-static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
-_loadbe_i32(void const * __P) {
- struct __loadu_i32 {
- int __v;
- } __attribute__((__packed__, __may_alias__));
- return __builtin_bswap32(((const struct __loadu_i32*)__P)->__v);
-}
-
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
-_storebe_i32(void * __P, int __D) {
- struct __storeu_i32 {
- int __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_i32*)__P)->__v = __builtin_bswap32(__D);
-}
-
-#ifdef __x86_64__
-static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
-_loadbe_i64(void const * __P) {
- struct __loadu_i64 {
- long long __v;
- } __attribute__((__packed__, __may_alias__));
- return __builtin_bswap64(((const struct __loadu_i64*)__P)->__v);
-}
-
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe")))
-_storebe_i64(void * __P, long long __D) {
- struct __storeu_i64 {
- long long __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_i64*)__P)->__v = __builtin_bswap64(__D);
-}
-#endif
-#endif /* __MOVBE */
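A minimal sketch of the MOVBE helpers above (assuming -mmovbe): reading and writing big-endian 32-bit fields in an unaligned wire-format buffer; the function names are illustrative.

#include <immintrin.h>

unsigned int read_be32(const void *p)
{
    return (unsigned int)_loadbe_i32(p);
}

void write_be32(void *p, unsigned int v)
{
    _storebe_i32(p, (int)v);
}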
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__RTM__)
-#include <rtmintrin.h>
-#include <xtestintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__SHA__)
-#include <shaintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__FXSR__)
-#include <fxsrintrin.h>
-#endif
-
-/* No feature check desired due to internal MSC_VER checks */
-#include <xsaveintrin.h>
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__XSAVEOPT__)
-#include <xsaveoptintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__XSAVEC__)
-#include <xsavecintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__XSAVES__)
-#include <xsavesintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__SHSTK__)
-#include <cetintrin.h>
-#endif
-
-/* Some intrinsics inside adxintrin.h are available only on processors with ADX,
- * whereas others are available at all times. */
-#include <adxintrin.h>
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__RDSEED__)
-#include <rdseedintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__WBNOINVD__)
-#include <wbnoinvdintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__CLDEMOTE__)
-#include <cldemoteintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__WAITPKG__)
-#include <waitpkgintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__MOVDIRI__) || defined(__MOVDIR64B__)
-#include <movdirintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__PCONFIG__)
-#include <pconfigintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__SGX__)
-#include <sgxintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__PTWRITE__)
-#include <ptwriteintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__INVPCID__)
-#include <invpcidintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__KL__) || defined(__WIDEKL__)
-#include <keylockerintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__)
-#include <amxintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__AVX512VP2INTERSECT__)
-#include <avx512vp2intersectintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__))
-#include <avx512vlvp2intersectintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__ENQCMD__)
-#include <enqcmdintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__SERIALIZE__)
-#include <serializeintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__TSXLDTRK__)
-#include <tsxldtrkintrin.h>
-#endif
-
-#if defined(_MSC_VER) && __has_extension(gnu_asm)
-/* Define the default attributes for these intrinsics */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
-#ifdef __cplusplus
-extern "C" {
-#endif
-/*----------------------------------------------------------------------------*\
-|* Interlocked Exchange HLE
-\*----------------------------------------------------------------------------*/
-#if defined(__i386__) || defined(__x86_64__)
-static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) {
- __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
- : "+r" (_Value), "+m" (*_Target) :: "memory");
- return _Value;
-}
-static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) {
- __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
- : "+r" (_Value), "+m" (*_Target) :: "memory");
- return _Value;
-}
-#endif
-#if defined(__x86_64__)
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) {
- __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}"
- : "+r" (_Value), "+m" (*_Target) :: "memory");
- return _Value;
-}
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) {
- __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}"
- : "+r" (_Value), "+m" (*_Target) :: "memory");
- return _Value;
-}
-#endif
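A sketch of a simple lock built on the HLE exchange intrinsics above; it only applies in the MSVC-compatible mode this block targets, and the function names are illustrative:

  static void hle_lock(long volatile *__lock)
  {
      /* XACQUIRE-prefixed exchange: the hardware may elide the lock */
      while (_InterlockedExchange_HLEAcquire(__lock, 1) != 0)
          ;
  }

  static void hle_unlock(long volatile *__lock)
  {
      /* XRELEASE-prefixed exchange ends the elided critical section */
      _InterlockedExchange_HLERelease(__lock, 0);
  }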
-/*----------------------------------------------------------------------------*\
-|* Interlocked Compare Exchange HLE
-\*----------------------------------------------------------------------------*/
-#if defined(__i386__) || defined(__x86_64__)
-static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination,
- long _Exchange, long _Comparand) {
- __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
- : "+a" (_Comparand), "+m" (*_Destination)
- : "r" (_Exchange) : "memory");
- return _Comparand;
-}
-static __inline__ long __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange_HLERelease(long volatile *_Destination,
- long _Exchange, long _Comparand) {
- __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
- : "+a" (_Comparand), "+m" (*_Destination)
- : "r" (_Exchange) : "memory");
- return _Comparand;
-}
-#endif
-#if defined(__x86_64__)
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination,
- __int64 _Exchange, __int64 _Comparand) {
- __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}"
- : "+a" (_Comparand), "+m" (*_Destination)
- : "r" (_Exchange) : "memory");
- return _Comparand;
-}
-static __inline__ __int64 __DEFAULT_FN_ATTRS
-_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination,
- __int64 _Exchange, __int64 _Comparand) {
- __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}"
- : "+a" (_Comparand), "+m" (*_Destination)
- : "r" (_Exchange) : "memory");
- return _Comparand;
-}
-#endif
-#ifdef __cplusplus
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* defined(_MSC_VER) && __has_extension(gnu_asm) */
-
-#endif /* __IMMINTRIN_H */
+++ /dev/null
-/*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <invpcidintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __INVPCIDINTRIN_H
-#define __INVPCIDINTRIN_H
-
-static __inline__ void
- __attribute__((__always_inline__, __nodebug__, __target__("invpcid")))
-_invpcid(unsigned int __type, void *__descriptor) {
- __builtin_ia32_invpcid(__type, __descriptor);
-}
-
-#endif /* __INVPCIDINTRIN_H */
+++ /dev/null
-/*===----------------- keylockerintrin.h - KL Intrinsics -------------------===
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <keylockerintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef _KEYLOCKERINTRIN_H
-#define _KEYLOCKERINTRIN_H
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__KL__)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("kl"),\
- __min_vector_width__(128)))
-
-/// Load the internal wrapping key (IWKey) from __intkey, __enkey_lo and
-/// __enkey_hi. __ctl is assigned to EAX and specifies the KeySource and
-/// whether backing up the key is permitted. The 256-bit encryption key is
-/// loaded from the two explicit operands (__enkey_lo and __enkey_hi). The
-/// 128-bit integrity key is loaded from the implicit operand XMM0, which is
-/// assigned by __intkey.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LOADIWKEY </c> instructions.
-///
-/// \operation
-/// IF CPL > 0 // LOADIWKEY only allowed at ring 0 (supervisor mode)
-/// GP (0)
-/// FI
-/// IF “LOADIWKEY exiting” VM execution control set
-/// VMexit
-/// FI
-/// IF __ctl[4:1] > 1 // Reserved KeySource encoding used
-/// GP (0)
-/// FI
-/// IF __ctl[31:5] != 0 // Reserved bit in __ctl is set
-/// GP (0)
-/// FI
-/// IF __ctl[0] AND (CPUID.19H.ECX[0] == 0) // NoBackup is not supported on this part
-/// GP (0)
-/// FI
-/// IF (__ctl[4:1] == 1) AND (CPUID.19H.ECX[1] == 0) // KeySource of 1 is not supported on this part
-/// GP (0)
-/// FI
-/// IF (__ctl[4:1] == 0) // KeySource of 0.
-/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0]
-/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0]
-/// IWKey.IntegrityKey[127:0] := __intkey[127:0]
-/// IWKey.NoBackup := __ctl[0]
-/// IWKey.KeySource := __ctl[4:1]
-/// ZF := 0
-/// ELSE // KeySource of 1. See RDSEED definition for details of randomness
-/// IF HW_NRND_GEN.ready == 1 // Full-entropy random data from RDSEED was received
-/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0] XOR HW_NRND_GEN.data[127:0]
-/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0] XOR HW_NRND_GEN.data[255:128]
-/// IWKey.IntegrityKey[127:0] := __intkey[127:0] XOR HW_NRND_GEN.data[383:256]
-/// IWKey.NoBackup := __ctl[0]
-/// IWKey.KeySource := __ctl[4:1]
-/// ZF := 0
-/// ELSE // Random data was not returned from RDSEED. IWKey was not loaded
-/// ZF := 1
-/// FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endoperation
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_loadiwkey (unsigned int __ctl, __m128i __intkey,
- __m128i __enkey_lo, __m128i __enkey_hi) {
- __builtin_ia32_loadiwkey (__intkey, __enkey_lo, __enkey_hi, __ctl);
-}
-
-/// Wrap a 128-bit AES key from __key into a key handle, store the handle in
-/// the three 128-bit blocks from ((__m128i*)__h) to ((__m128i*)__h) + 2, and
-/// return a 32-bit status value.
-/// The explicit source operand __htype specifies handle restrictions.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> ENCODEKEY128 </c> instructions.
-///
-/// \operation
-/// InputKey[127:0] := __key[127:0]
-/// KeyMetadata[2:0] := __htype[2:0]
-/// KeyMetadata[23:3] := 0 // Reserved for future usage
-/// KeyMetadata[27:24] := 0 // KeyType is AES-128 (value of 0)
-/// KeyMetadata[127:28] := 0 // Reserved for future usage
-/// Handle[383:0] := WrapKey128(InputKey[127:0], KeyMetadata[127:0],
-/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
-/// dst[0] := IWKey.NoBackup
-/// dst[4:1] := IWKey.KeySource[3:0]
-/// dst[31:5] := 0
-/// MEM[__h+127:__h] := Handle[127:0] // AAD
-/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag
-/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText
-/// OF := 0
-/// SF := 0
-/// ZF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endoperation
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) {
- return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h);
-}
-
-/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, store the
-/// handle in the four 128-bit blocks from ((__m128i*)__h) to
-/// ((__m128i*)__h) + 3, and return a 32-bit status value.
-/// The explicit source operand __htype specifies handle restrictions.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> ENCODEKEY256 </c> instructions.
-///
-/// \operation
-/// InputKey[127:0] := __key_lo[127:0]
-/// InputKey[255:128] := __key_hi[127:0]
-/// KeyMetadata[2:0] := __htype[2:0]
-/// KeyMetadata[23:3] := 0 // Reserved for future usage
-/// KeyMetadata[27:24] := 1 // KeyType is AES-256 (value of 1)
-/// KeyMetadata[127:28] := 0 // Reserved for future usage
-/// Handle[511:0] := WrapKey256(InputKey[255:0], KeyMetadata[127:0],
-/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0])
-/// dst[0] := IWKey.NoBackup
-/// dst[4:1] := IWKey.KeySource[3:0]
-/// dst[31:5] := 0
-/// MEM[__h+127:__h] := Handle[127:0] // AAD
-/// MEM[__h+255:__h+128] := Handle[255:128] // Tag
-/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0]
-/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128]
-/// OF := 0
-/// SF := 0
-/// ZF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endoperation
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi,
- void *__h) {
- return __builtin_ia32_encodekey256_u32(__htype, (__v2di)__key_lo,
- (__v2di)__key_hi, __h);
-}
-
-/// The AESENC128KL instruction performs 10 rounds of AES to encrypt __idata
-/// using the 128-bit key in the handle at __h, stores the result in __odata,
-/// and returns the resulting ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESENC128KL </c> instructions.
-///
-/// \operation
-/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
-/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
-/// (Handle[127:0] AND (CPL > 0)) ||
-/// Handle[383:256] ||
-/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
-/// IF (IllegalHandle)
-/// ZF := 1
-/// ELSE
-/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
-/// IF (Authentic == 0)
-/// ZF := 1
-/// ELSE
-/// MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], UnwrappedKey)
-/// ZF := 0
-/// FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endoperation
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
- return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
-}
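A hedged sketch tying _mm_encodekey128_u32 and _mm_aesenc128kl_u8 together; it assumes the OS has already executed LOADIWKEY at ring 0, and the helper name and the __htype value of 0 (no handle restrictions) are illustrative:

  static unsigned char
  kl_encrypt_block(__m128i __raw_key, __m128i __plain, __m128i *__cipher)
  {
      __m128i __handle[3];                          /* 384-bit AES-128 handle */
      _mm_encodekey128_u32(0, __raw_key, __handle); /* wrap the raw key */
      /* returns 0 on success, 1 if the handle is rejected */
      return _mm_aesenc128kl_u8(__cipher, __plain, __handle);
  }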
-
-/// The AESENC256KL instruction performs 14 rounds of AES to encrypt __idata
-/// using the 256-bit key in the handle at __h, stores the result in __odata,
-/// and returns the resulting ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESENC256KL </c> instructions.
-///
-/// \operation
-/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic.
-/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
-/// (Handle[127:0] AND (CPL > 0)) ||
-/// Handle[255:128] ||
-/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
-/// IF (IllegalHandle)
-/// ZF := 1
-/// MEM[__odata+127:__odata] := 0
-/// ELSE
-/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
-/// IF (Authentic == 0)
-/// ZF := 1
-/// MEM[__odata+127:__odata] := 0
-/// ELSE
-/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey)
-/// ZF := 0
-/// FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endoperation
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
- return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
-}
-
-/// The AESDEC128KL instruction performs 10 rounds of AES to decrypt __idata
-/// using the 128-bit key in the handle at __h, stores the result in __odata,
-/// and returns the resulting ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESDEC128KL </c> instructions.
-///
-/// \operation
-/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic.
-/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) ||
-/// (Handle[127:0] AND (CPL > 0)) ||
-/// Handle[383:256] ||
-/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128)
-/// IF (IllegalHandle)
-/// ZF := 1
-/// MEM[__odata+127:__odata] := 0
-/// ELSE
-/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
-/// IF (Authentic == 0)
-/// ZF := 1
-/// MEM[__odata+127:__odata] := 0
-/// ELSE
-/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey)
-/// ZF := 0
-/// FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endoperation
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
- return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
-}
-
-/// The AESDEC256KL instruction performs 14 rounds of AES to decrypt __idata
-/// using the 256-bit key in the handle at __h, stores the result in __odata,
-/// and returns the resulting ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESDEC256KL </c> instructions.
-///
-/// \operation
-/// Handle[511:0] := MEM[__h+511:__h]
-/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) ||
-/// (Handle[127:0] AND (CPL > 0)) ||
-/// Handle[383:256] ||
-/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256)
-/// IF (IllegalHandle)
-/// ZF := 1
-/// MEM[__odata+127:__odata] := 0
-/// ELSE
-/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
-/// IF (Authentic == 0)
-/// ZF := 1
-/// MEM[__odata+127:__odata] := 0
-/// ELSE
-/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey)
-/// ZF := 0
-/// FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endoperation
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) {
- return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
- || defined(__KL__) */
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__WIDEKL__)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\
- __min_vector_width__(128)))
-
-/// Encrypt __idata[0] through __idata[7] using the 128-bit AES key indicated
-/// by the handle at __h, store the resulting blocks in __odata[0] through
-/// __odata[7], and return the resulting ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESENCWIDE128KL </c> instructions.
-///
-/// \operation
-/// Handle := MEM[__h+383:__h]
-/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
-/// (Handle[127:0] AND (CPL > 0)) ||
-/// Handle[255:128] ||
-/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 )
-/// IF (IllegalHandle)
-/// ZF := 1
-/// FOR i := 0 to 7
-/// __odata[i] := 0
-/// ENDFOR
-/// ELSE
-/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
-/// IF Authentic == 0
-/// ZF := 1
-/// FOR i := 0 to 7
-/// __odata[i] := 0
-/// ENDFOR
-/// ELSE
-/// FOR i := 0 to 7
-/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey)
-/// ENDFOR
-/// ZF := 0
-/// FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endoperation
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
- return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata,
- (const __v2di *)__idata, __h);
-}
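A small sketch of the wide form, assuming a handle produced earlier with _mm_encodekey128_u32 (the helper name is illustrative); it encrypts eight blocks in one call and passes the status byte through:

  static unsigned char
  kl_encrypt8(__m128i __out[8], const __m128i __in[8], const __m128i __handle[3])
  {
      /* 0 on success; 1 if the handle is illegal or fails authentication */
      return _mm_aesencwide128kl_u8(__out, __in, __handle);
  }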
-
-/// Encrypt __idata[0] through __idata[7] using the 256-bit AES key indicated
-/// by the handle at __h, store the resulting blocks in __odata[0] through
-/// __odata[7], and return the resulting ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESENCWIDE256KL </c> instructions.
-///
-/// \operation
-/// Handle[511:0] := MEM[__h+511:__h]
-/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
-/// (Handle[127:0] AND (CPL > 0)) ||
-/// Handle[255:128] ||
-/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 )
-/// IF (IllegalHandle)
-/// ZF := 1
-/// FOR i := 0 to 7
-/// __odata[i] := 0
-/// ENDFOR
-/// ELSE
-/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
-/// IF Authentic == 0
-/// ZF := 1
-/// FOR i := 0 to 7
-/// __odata[i] := 0
-/// ENDFOR
-/// ELSE
-/// FOR i := 0 to 7
-/// __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey)
-/// ENDFOR
-/// ZF := 0
-/// FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endoperation
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
- return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata,
- (const __v2di *)__idata, __h);
-}
-
-/// Decrypt __idata[0] through __idata[7] using the 128-bit AES key indicated
-/// by the handle at __h, store the resulting blocks in __odata[0] through
-/// __odata[7], and return the resulting ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESDECWIDE128KL </c> instructions.
-///
-/// \operation
-/// Handle[383:0] := MEM[__h+383:__h]
-/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) ||
-/// (Handle[127:0] AND (CPL > 0)) ||
-/// Handle[255:128] ||
-/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 )
-/// IF (IllegalHandle)
-/// ZF := 1
-/// FOR i := 0 to 7
-/// __odata[i] := 0
-/// ENDFOR
-/// ELSE
-/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey)
-/// IF Authentic == 0
-/// ZF := 1
-/// FOR i := 0 to 7
-/// __odata[i] := 0
-/// ENDFOR
-/// ELSE
-/// FOR i := 0 to 7
-/// __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey)
-/// ENDFOR
-/// ZF := 0
-/// FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endoperation
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
- return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata,
- (const __v2di *)__idata, __h);
-}
-
-/// Decrypt __idata[0] through __idata[7] using the 256-bit AES key indicated
-/// by the handle at __h, store the resulting blocks in __odata[0] through
-/// __odata[7], and return the resulting ZF flag status.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> AESDECWIDE256KL </c> instructions.
-///
-/// \operation
-/// Handle[511:0] := MEM[__h+511:__h]
-/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) ||
-/// (Handle[127:0] AND (CPL > 0)) ||
-/// Handle[255:128] ||
-/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES256 )
-/// IF (IllegalHandle)
-/// ZF := 1
-/// FOR i := 0 to 7
-/// __odata[i] := 0
-/// ENDFOR
-/// ELSE
-/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey)
-/// IF Authentic == 0
-/// ZF := 1
-/// FOR i := 0 to 7
-/// __odata[i] := 0
-/// ENDFOR
-/// ELSE
-/// FOR i := 0 to 7
-/// __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey)
-/// ENDFOR
-/// ZF := 0
-/// FI
-/// FI
-/// dst := ZF
-/// OF := 0
-/// SF := 0
-/// AF := 0
-/// PF := 0
-/// CF := 0
-/// \endoperation
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) {
- return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata,
- (const __v2di *)__idata, __h);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \
- || defined(__WIDEKL__) */
-
-#endif /* _KEYLOCKERINTRIN_H */
+++ /dev/null
-/*===---- lwpintrin.h - LWP intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#error "Never use <lwpintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __LWPINTRIN_H
-#define __LWPINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp")))
-
-/// Parses the LWPCB at the specified address and enables
-/// profiling if valid.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LLWPCB </c> instruction.
-///
-/// \param __addr
-/// Address of the new Lightweight Profiling Control Block (LWPCB). If the
-/// LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables
-/// Lightweight Profiling.
-static __inline__ void __DEFAULT_FN_ATTRS
-__llwpcb (void *__addr)
-{
- __builtin_ia32_llwpcb(__addr);
-}
-
-/// Flushes the LWP state to memory and returns the address of the LWPCB.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> SLWPCB </c> instruction.
-///
-/// \return
-/// Address of the current Lightweight Profiling Control Block (LWPCB).
-/// If LWP is not currently enabled, returns NULL.
-static __inline__ void* __DEFAULT_FN_ATTRS
-__slwpcb (void)
-{
- return __builtin_ia32_slwpcb();
-}
-
-/// Inserts programmed event record into the LWP event ring buffer
-/// and advances the ring buffer pointer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
-///
-/// \param DATA2
-/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
-/// \param DATA1
-/// A 32-bit value is inserted into the 32-bit Data1 field.
-/// \param FLAGS
-/// A 32-bit immediate value is inserted into the 32-bit Flags field.
-/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
-/// the event record overwrites the last record in the buffer, the MissedEvents
-/// counter in the LWPCB is incremented, the head pointer is not advanced, and
-/// 1 is returned. Otherwise 0 is returned.
-#define __lwpins32(DATA2, DATA1, FLAGS) \
- (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \
- (unsigned int) (FLAGS)))
-
-/// Decrements the LWP programmed value sample event counter. If the result is
-/// negative, inserts an event record into the LWP event ring buffer in memory
-/// and advances the ring buffer pointer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
-///
-/// \param DATA2
-/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field.
-/// \param DATA1
-/// A 32-bit value is inserted into the 32-bit Data1 field.
-/// \param FLAGS
-/// A 32-bit immediate value is inserted into the 32-bit Flags field.
-#define __lwpval32(DATA2, DATA1, FLAGS) \
- (__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \
- (unsigned int) (FLAGS)))
-
-#ifdef __x86_64__
-
-/// Inserts programmed event record into the LWP event ring buffer
-/// and advances the ring buffer pointer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LWPINS </c> instruction.
-///
-/// \param DATA2
-/// A 64-bit value is inserted into the 64-bit Data2 field.
-/// \param DATA1
-/// A 32-bit value is inserted into the 32-bit Data1 field.
-/// \param FLAGS
-/// A 32-bit immediate value is inserted into the 32-bit Flags field.
-/// \returns If the ring buffer is full and LWP is running in Synchronized Mode,
-/// the event record overwrites the last record in the buffer, the MissedEvents
-/// counter in the LWPCB is incremented, the head pointer is not advanced, and
-/// 1 is returned. Otherwise 0 is returned.
-#define __lwpins64(DATA2, DATA1, FLAGS) \
- (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
- (unsigned int) (FLAGS)))
-
-/// Decrements the LWP programmed value sample event counter. If the result is
-/// negative, inserts an event record into the LWP event ring buffer in memory
-/// and advances the ring buffer pointer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> LWPVAL </c> instruction.
-///
-/// \param DATA2
-/// A 64-bit value is inserted into the 64-bit Data2 field.
-/// \param DATA1
-/// A 32-bit value is inserted into the 32-bit Data1 field.
-/// \param FLAGS
-/// A 32-bit immediate value is inserted into the 32-bit Flags field.
-#define __lwpval64(DATA2, DATA1, FLAGS) \
- (__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \
- (unsigned int) (FLAGS)))
-
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __LWPINTRIN_H */
+++ /dev/null
-/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <lzcntintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __LZCNTINTRIN_H
-#define __LZCNTINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt")))
-
-#ifndef _MSC_VER
-/// Counts the number of leading zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c LZCNT instruction.
-///
-/// \param __X
-/// An unsigned 16-bit integer whose leading zeros are to be counted.
-/// \returns An unsigned 16-bit integer containing the number of leading zero
-/// bits in the operand.
-#define __lzcnt16(X) __builtin_ia32_lzcnt_u16((unsigned short)(X))
-#endif // _MSC_VER
-
-/// Counts the number of leading zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c LZCNT instruction.
-///
-/// \param __X
-/// An unsigned 32-bit integer whose leading zeros are to be counted.
-/// \returns An unsigned 32-bit integer containing the number of leading zero
-/// bits in the operand.
-/// \see _lzcnt_u32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__lzcnt32(unsigned int __X)
-{
- return __builtin_ia32_lzcnt_u32(__X);
-}
-
-/// Counts the number of leading zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c LZCNT instruction.
-///
-/// \param __X
-/// An unsigned 32-bit integer whose leading zeros are to be counted.
-/// \returns An unsigned 32-bit integer containing the number of leading zero
-/// bits in the operand.
-/// \see __lzcnt32
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_lzcnt_u32(unsigned int __X)
-{
- return __builtin_ia32_lzcnt_u32(__X);
-}
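An illustrative helper built on _lzcnt_u32: floor(log2(x)) for a nonzero 32-bit value (LZCNT of 0 is 32, so callers must guard against zero; the helper name is an assumption):

  static unsigned int
  ilog2_u32(unsigned int __x)
  {
      /* e.g. __x == 40: LZCNT is 26, so the result is 5 (2^5 <= 40 < 2^6) */
      return 31 - _lzcnt_u32(__x);
  }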
-
-#ifdef __x86_64__
-#ifndef _MSC_VER
-/// Counts the number of leading zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c LZCNT instruction.
-///
-/// \param __X
-/// An unsigned 64-bit integer whose leading zeros are to be counted.
-/// \returns An unsigned 64-bit integer containing the number of leading zero
-/// bits in the operand.
-/// \see _lzcnt_u64
-#define __lzcnt64(X) __builtin_ia32_lzcnt_u64((unsigned long long)(X))
-#endif // _MSC_VER
-
-/// Counts the number of leading zero bits in the operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c LZCNT instruction.
-///
-/// \param __X
-/// An unsigned 64-bit integer whose leading zeros are to be counted.
-/// \returns An unsigned 64-bit integer containing the number of leading zero
-/// bits in the operand.
-/// \see __lzcnt64
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-_lzcnt_u64(unsigned long long __X)
-{
- return __builtin_ia32_lzcnt_u64(__X);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __LZCNTINTRIN_H */
+++ /dev/null
-/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef _MM3DNOW_H_INCLUDED
-#define _MM3DNOW_H_INCLUDED
-
-#include <mmintrin.h>
-#include <prfchwintrin.h>
-
-typedef float __v2sf __attribute__((__vector_size__(8)));
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow"), __min_vector_width__(64)))
-
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("3dnow")))
-_m_femms(void) {
- __builtin_ia32_femms();
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pavgusb(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pf2id(__m64 __m) {
- return (__m64)__builtin_ia32_pf2id((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfacc(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfadd(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2);
-}
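An illustrative wrapper (3DNow! is AMD-specific and long deprecated; the helper name is an assumption) that adds two pairs of packed single-precision floats; _m_femms() should still be issued once all MMX/3DNow! work is finished:

  static __m64
  pf_add_pairs(__m64 __a, __m64 __b)
  {
      /* PFADD: two packed single-precision additions */
      return _m_pfadd(__a, __b);
  }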
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfcmpeq(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfcmpge(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfcmpgt(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfmax(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfmin(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfmul(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrcp(__m64 __m) {
- return (__m64)__builtin_ia32_pfrcp((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrcpit1(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrcpit2(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrsqrt(__m64 __m) {
- return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfrsqrtit1(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfsub(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfsubr(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pi2fd(__m64 __m) {
- return (__m64)__builtin_ia32_pi2fd((__v2si)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pmulhrw(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/* Handle the 3dnowa instructions here. */
-#undef __DEFAULT_FN_ATTRS
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa"), __min_vector_width__(64)))
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pf2iw(__m64 __m) {
- return (__m64)__builtin_ia32_pf2iw((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfnacc(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pfpnacc(__m64 __m1, __m64 __m2) {
- return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pi2fw(__m64 __m) {
- return (__m64)__builtin_ia32_pi2fw((__v2si)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pswapdsf(__m64 __m) {
- return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m);
-}
-
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_m_pswapdsi(__m64 __m) {
- return (__m64)__builtin_ia32_pswapdsi((__v2si)__m);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
+++ /dev/null
-/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __MM_MALLOC_H
-#define __MM_MALLOC_H
-
-#include <stdlib.h>
-
-#ifdef _WIN32
-#include <malloc.h>
-#else
-#ifndef __cplusplus
-extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
-#else
-// Some systems (e.g. those with GNU libc) declare posix_memalign with an
-// exception specifier. Via an "egregious workaround" in
-// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid
-// redeclaration of glibc's declaration.
-extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size);
-#endif
-#endif
-
-#if !(defined(_WIN32) && defined(_mm_malloc))
-static __inline__ void *__attribute__((__always_inline__, __nodebug__,
- __malloc__))
-_mm_malloc(size_t __size, size_t __align)
-{
- if (__align == 1) {
- return malloc(__size);
- }
-
- if (!(__align & (__align - 1)) && __align < sizeof(void *))
- __align = sizeof(void *);
-
- void *__mallocedMemory;
-#if defined(__MINGW32__)
- __mallocedMemory = __mingw_aligned_malloc(__size, __align);
-#elif defined(_WIN32)
- __mallocedMemory = _aligned_malloc(__size, __align);
-#else
- if (posix_memalign(&__mallocedMemory, __align, __size))
- return 0;
-#endif
-
- return __mallocedMemory;
-}
-
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
-_mm_free(void *__p)
-{
-#if defined(__MINGW32__)
- __mingw_aligned_free(__p);
-#elif defined(_WIN32)
- _aligned_free(__p);
-#else
- free(__p);
-#endif
-}
-#endif
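A short usage sketch, assuming a caller that wants 32-byte alignment for 256-bit vector loads (the helper name is illustrative); memory obtained this way must be released with _mm_free, never plain free:

  static float *
  alloc_aligned_floats(size_t __n)
  {
      /* 32-byte alignment keeps 256-bit aligned loads/stores legal */
      return (float *)_mm_malloc(__n * sizeof(float), 32);
  }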
-
-#endif /* __MM_MALLOC_H */
+++ /dev/null
-/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __MMINTRIN_H
-#define __MMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));
-
-typedef long long __v1di __attribute__((__vector_size__(8)));
-typedef int __v2si __attribute__((__vector_size__(8)));
-typedef short __v4hi __attribute__((__vector_size__(8)));
-typedef char __v8qi __attribute__((__vector_size__(8)));
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))
-
-/// Clears the MMX state by setting the state of the x87 stack registers
-/// to empty.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> EMMS </c> instruction.
-///
-static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
-_mm_empty(void)
-{
- __builtin_ia32_emms();
-}
-
-/// Constructs a 64-bit integer vector, setting the lower 32 bits to the
-/// value of the 32-bit integer parameter and setting the upper 32 bits to 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVD </c> instruction.
-///
-/// \param __i
-/// A 32-bit integer value.
-/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
-/// parameter. The upper 32 bits are set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cvtsi32_si64(int __i)
-{
- return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
-}
-
-/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
-/// signed integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVD </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector.
-/// \returns A 32-bit signed integer value containing the lower 32 bits of the
-/// parameter.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvtsi64_si32(__m64 __m)
-{
- return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
-}
-
-/// Casts a 64-bit signed integer value into a 64-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
-///
-/// \param __i
-/// A 64-bit signed integer.
-/// \returns A 64-bit integer vector containing the same bitwise pattern as the
-/// parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cvtsi64_m64(long long __i)
-{
- return (__m64)__i;
-}
-
-/// Casts a 64-bit integer vector into a 64-bit signed integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector.
-/// \returns A 64-bit signed integer containing the same bitwise pattern as the
-/// parameter.
-static __inline__ long long __DEFAULT_FN_ATTRS
-_mm_cvtm64_si64(__m64 __m)
-{
- return (long long)__m;
-}
-
-/// Converts 16-bit signed integers from both 64-bit integer vector
-/// parameters of [4 x i16] into 8-bit signed integer values, and constructs
-/// a 64-bit integer vector of [8 x i8] as the result. Positive values
-/// greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
-/// are saturated to 0x80.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-/// 16-bit signed integer and is converted to an 8-bit signed integer with
-/// saturation. Positive values greater than 0x7F are saturated to 0x7F.
-/// Negative values less than 0x80 are saturated to 0x80. The converted
-/// [4 x i8] values are written to the lower 32 bits of the result.
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-/// 16-bit signed integer and is converted to an 8-bit signed integer with
-/// saturation. Positive values greater than 0x7F are saturated to 0x7F.
-/// Negative values less than 0x80 are saturated to 0x80. The converted
-/// [4 x i8] values are written to the upper 32 bits of the result.
-/// \returns A 64-bit integer vector of [8 x i8] containing the converted
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_packs_pi16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Converts 32-bit signed integers from both 64-bit integer vector
-/// parameters of [2 x i32] into 16-bit signed integer values, and constructs
-/// a 64-bit integer vector of [4 x i16] as the result. Positive values
-/// greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
-/// 0x8000 are saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
-/// 32-bit signed integer and is converted to a 16-bit signed integer with
-/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
-/// Negative values less than 0x8000 are saturated to 0x8000. The converted
-/// [2 x i16] values are written to the lower 32 bits of the result.
-/// \param __m2
-/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
-/// 32-bit signed integer and is converted to a 16-bit signed integer with
-/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
-/// Negative values less than 0x8000 are saturated to 0x8000. The converted
-/// [2 x i16] values are written to the upper 32 bits of the result.
-/// \returns A 64-bit integer vector of [4 x i16] containing the converted
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_packs_pi32(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Converts 16-bit signed integers from both 64-bit integer vector
-/// parameters of [4 x i16] into 8-bit unsigned integer values, and
-/// constructs a 64-bit integer vector of [8 x i8] as the result. Values
-/// greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
-/// to 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-/// 16-bit signed integer and is converted to an 8-bit unsigned integer with
-/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-/// than 0 are saturated to 0. The converted [4 x i8] values are written to
-/// the lower 32 bits of the result.
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
-/// 16-bit signed integer and is converted to an 8-bit unsigned integer with
-/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
-/// than 0 are saturated to 0. The converted [4 x i8] values are written to
-/// the upper 32 bits of the result.
-/// \returns A 64-bit integer vector of [8 x i8] containing the converted
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_packs_pu16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
-}
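A sketch of a typical use of the saturating pack above: narrowing eight 16-bit samples held in two __m64 values down to eight unsigned bytes (the helper name is an assumption):

  static __m64
  narrow_to_u8(__m64 __lo, __m64 __hi)
  {
      /* PACKUSWB: clamps each 16-bit value to the range [0, 255] */
      return _mm_packs_pu16(__lo, __hi);
  }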
-
-/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
-/// and interleaves them into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [8 x i8]. \n
-/// Bits [39:32] are written to bits [7:0] of the result. \n
-/// Bits [47:40] are written to bits [23:16] of the result. \n
-/// Bits [55:48] are written to bits [39:32] of the result. \n
-/// Bits [63:56] are written to bits [55:48] of the result.
-/// \param __m2
-/// A 64-bit integer vector of [8 x i8].
-/// Bits [39:32] are written to bits [15:8] of the result. \n
-/// Bits [47:40] are written to bits [31:24] of the result. \n
-/// Bits [55:48] are written to bits [47:40] of the result. \n
-/// Bits [63:56] are written to bits [63:56] of the result.
-/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Unpacks the upper 32 bits from two 64-bit integer vectors of
-/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16].
-/// Bits [47:32] are written to bits [15:0] of the result. \n
-/// Bits [63:48] are written to bits [47:32] of the result.
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16].
-/// Bits [47:32] are written to bits [31:16] of the result. \n
-/// Bits [63:48] are written to bits [63:48] of the result.
-/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Unpacks the upper 32 bits from two 64-bit integer vectors of
-/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
-/// the lower 32 bits of the result.
-/// \param __m2
-/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
-/// the upper 32 bits of the result.
-/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
-/// and interleaves them into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [8 x i8].
-/// Bits [7:0] are written to bits [7:0] of the result. \n
-/// Bits [15:8] are written to bits [23:16] of the result. \n
-/// Bits [23:16] are written to bits [39:32] of the result. \n
-/// Bits [31:24] are written to bits [55:48] of the result.
-/// \param __m2
-/// A 64-bit integer vector of [8 x i8].
-/// Bits [7:0] are written to bits [15:8] of the result. \n
-/// Bits [15:8] are written to bits [31:24] of the result. \n
-/// Bits [23:16] are written to bits [47:40] of the result. \n
-/// Bits [31:24] are written to bits [63:56] of the result.
-/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Unpacks the lower 32 bits from two 64-bit integer vectors of
-/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16].
-/// Bits [15:0] are written to bits [15:0] of the result. \n
-/// Bits [31:16] are written to bits [47:32] of the result.
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16].
-/// Bits [15:0] are written to bits [31:16] of the result. \n
-/// Bits [31:16] are written to bits [63:48] of the result.
-/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Unpacks the lower 32 bits from two 64-bit integer vectors of
-/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
-/// the lower 32 bits of the result.
-/// \param __m2
-/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
-/// the upper 32 bits of the result.
-/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Adds each 8-bit integer element of the first 64-bit integer vector
-/// of [8 x i8] to the corresponding 8-bit integer element of the second
-/// 64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
-/// packed into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDB </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [8 x i8].
-/// \param __m2
-/// A 64-bit integer vector of [8 x i8].
-/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
-/// parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_add_pi8(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Adds each 16-bit integer element of the first 64-bit integer vector
-/// of [4 x i16] to the corresponding 16-bit integer element of the second
-/// 64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
-/// packed into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16].
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
-/// parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_add_pi16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Adds each 32-bit integer element of the first 64-bit integer vector
-/// of [2 x i32] to the corresponding 32-bit integer element of the second
-/// 64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
-/// packed into a 64-bit integer vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDD </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [2 x i32].
-/// \param __m2
-/// A 64-bit integer vector of [2 x i32].
-/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
-/// parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_add_pi32(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Adds each 8-bit signed integer element of the first 64-bit integer
-/// vector of [8 x i8] to the corresponding 8-bit signed integer element of
-/// the second 64-bit integer vector of [8 x i8]. Positive sums greater than
-/// 0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
-/// 0x80. The results are packed into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [8 x i8].
-/// \param __m2
-/// A 64-bit integer vector of [8 x i8].
-/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
-/// of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_adds_pi8(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Adds each 16-bit signed integer element of the first 64-bit integer
-/// vector of [4 x i16] to the corresponding 16-bit signed integer element of
-/// the second 64-bit integer vector of [4 x i16]. Positive sums greater than
-/// 0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
-/// saturated to 0x8000. The results are packed into a 64-bit integer vector
-/// of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16].
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
-/// of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_adds_pi16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Adds each 8-bit unsigned integer element of the first 64-bit integer
-/// vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
-/// the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
-/// saturated to 0xFF. The results are packed into a 64-bit integer vector of
-/// [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [8 x i8].
-/// \param __m2
-/// A 64-bit integer vector of [8 x i8].
-/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
-/// unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_adds_pu8(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Adds each 16-bit unsigned integer element of the first 64-bit integer
-/// vector of [4 x i16] to the corresponding 16-bit unsigned integer element
-/// of the second 64-bit integer vector of [4 x i16]. Sums greater than
-/// 0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
-/// integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16].
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
-/// unsigned sums of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_adds_pu16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
-}
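
/*
 * Illustrative sketch (not part of the original header): unlike the wrapping
 * _mm_add_* forms above, the saturating adds clamp at the type's limits.
 * Assumes an MMX target and <mmintrin.h>; the helper name is hypothetical.
 */
static inline long long example_adds_pu8_saturates(void)
{
    __m64 a = _mm_cvtsi64_m64((long long)0xF0F0F0F0F0F0F0F0ULL); /* eight copies of 240 */
    __m64 b = _mm_cvtsi64_m64(0x2020202020202020LL);             /* eight copies of 32 */
    __m64 sum = _mm_adds_pu8(a, b);       /* 240 + 32 exceeds 255, so every byte clamps to 0xFF */
    long long bits = _mm_cvtm64_si64(sum);
    _mm_empty();
    return bits;                          /* 0xFFFFFFFFFFFFFFFF */
}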
-
-/// Subtracts each 8-bit integer element of the second 64-bit integer
-/// vector of [8 x i8] from the corresponding 8-bit integer element of the
-/// first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
-/// are packed into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [8 x i8] containing the minuends.
-/// \param __m2
-/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
-/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
-/// both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sub_pi8(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Subtracts each 16-bit integer element of the second 64-bit integer
-/// vector of [4 x i16] from the corresponding 16-bit integer element of the
-/// first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
-/// results are packed into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16] containing the minuends.
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
-/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
-/// both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sub_pi16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Subtracts each 32-bit integer element of the second 64-bit integer
-/// vector of [2 x i32] from the corresponding 32-bit integer element of the
-/// first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
-/// results are packed into a 64-bit integer vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [2 x i32] containing the minuends.
-/// \param __m2
-/// A 64-bit integer vector of [2 x i32] containing the subtrahends.
-/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
-/// both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sub_pi32(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Subtracts each 8-bit signed integer element of the second 64-bit
-/// integer vector of [8 x i8] from the corresponding 8-bit signed integer
-/// element of the first 64-bit integer vector of [8 x i8]. Positive results
-/// greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
-/// are saturated to 0x80. The results are packed into a 64-bit integer
-/// vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [8 x i8] containing the minuends.
-/// \param __m2
-/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
-/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
-/// differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_subs_pi8(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Subtracts each 16-bit signed integer element of the second 64-bit
-/// integer vector of [4 x i16] from the corresponding 16-bit signed integer
-/// element of the first 64-bit integer vector of [4 x i16]. Positive results
-/// greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
-/// 0x8000 are saturated to 0x8000. The results are packed into a 64-bit
-/// integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16] containing the minuends.
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
-/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
-/// differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_subs_pi16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Subtracts each 8-bit unsigned integer element of the second 64-bit
-/// integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
-/// element of the first 64-bit integer vector of [8 x i8].
-///
-/// If an element of the first vector is less than the corresponding element
-/// of the second vector, the result is saturated to 0. The results are
-/// packed into a 64-bit integer vector of [8 x i8].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [8 x i8] containing the minuends.
-/// \param __m2
-/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
-/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
-/// differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_subs_pu8(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Subtracts each 16-bit unsigned integer element of the second 64-bit
-/// integer vector of [4 x i16] from the corresponding 16-bit unsigned
-/// integer element of the first 64-bit integer vector of [4 x i16].
-///
-/// If an element of the first vector is less than the corresponding element
-/// of the second vector, the result is saturated to 0. The results are
-/// packed into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16] containing the minuends.
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
-/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
-/// differences of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_subs_pu16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Multiplies each 16-bit signed integer element of the first 64-bit
-/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
-/// element of the second 64-bit integer vector of [4 x i16], producing four
-/// 32-bit products. Adjacent pairs of products are added to give two 32-bit
-/// sums.
-/// The lower 32 bits of these two sums are packed into a 64-bit integer
-/// vector of [2 x i32].
-///
-/// For example, bits [15:0] of both parameters are multiplied, bits [31:16]
-/// of both parameters are multiplied, and the sum of both results is written
-/// to bits [31:0] of the result.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16].
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
-/// products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_madd_pi16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
-}
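
/*
 * Illustrative sketch (not part of the original header): _mm_madd_pi16 is the
 * usual building block for short fixed-point dot products. The helper name is
 * hypothetical; assumes an MMX target and <mmintrin.h>.
 */
static inline int example_dot4_i16(__m64 a, __m64 b)
{
    __m64 pairs = _mm_madd_pi16(a, b);              /* (a0*b0 + a1*b1, a2*b2 + a3*b3) */
    __m64 upper = _mm_unpackhi_pi32(pairs, pairs);  /* broadcast the upper 32-bit sum */
    int dot = _mm_cvtsi64_si32(_mm_add_pi32(pairs, upper)); /* low lane holds the total */
    _mm_empty();
    return dot;
}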
-
-/// Multiplies each 16-bit signed integer element of the first 64-bit
-/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
-/// element of the second 64-bit integer vector of [4 x i16]. Packs the upper
-/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16].
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
-/// of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Multiplies each 16-bit signed integer element of the first 64-bit
-/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
-/// element of the second 64-bit integer vector of [4 x i16]. Packs the lower
-/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16].
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
-/// of the products of both parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_mullo_pi16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
-}
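
/*
 * Illustrative sketch (not part of the original header): combining
 * _mm_mullo_pi16 and _mm_mulhi_pi16 with the unpack intrinsics recovers the
 * full signed 32-bit products. The helper name is hypothetical.
 */
static inline __m64 example_widening_mul_lo(__m64 a, __m64 b)
{
    __m64 lo = _mm_mullo_pi16(a, b);   /* low 16 bits of each product */
    __m64 hi = _mm_mulhi_pi16(a, b);   /* high 16 bits of each product */
    return _mm_unpacklo_pi16(lo, hi);  /* [2 x i32]: the full products a0*b0 and a1*b1 */
}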
-
-/// Left-shifts each 16-bit signed integer element of the first
-/// parameter, which is a 64-bit integer vector of [4 x i16], by the number
-/// of bits specified by the second parameter, which is a 64-bit integer. The
-/// lower 16 bits of the results are packed into a 64-bit integer vector of
-/// [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector of [4 x i16].
-/// \param __count
-/// A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
-/// values. If \a __count is greater than or equal to 16, the result is set
-/// to all zeros.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sll_pi16(__m64 __m, __m64 __count)
-{
- return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
-}
-
-/// Left-shifts each 16-bit signed integer element of a 64-bit integer
-/// vector of [4 x i16] by the number of bits specified by a 32-bit integer.
-/// The lower 16 bits of the results are packed into a 64-bit integer vector
-/// of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector of [4 x i16].
-/// \param __count
-/// A 32-bit integer value.
-/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
-/// values. If \a __count is greater than or equal to 16, the result is set
-/// to all zeros.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_slli_pi16(__m64 __m, int __count)
-{
- return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
-}
-
-/// Left-shifts each 32-bit signed integer element of the first
-/// parameter, which is a 64-bit integer vector of [2 x i32], by the number
-/// of bits specified by the second parameter, which is a 64-bit integer. The
-/// lower 32 bits of the results are packed into a 64-bit integer vector of
-/// [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector of [2 x i32].
-/// \param __count
-/// A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
-/// values. If \a __count is greater than or equal to 32, the result is set
-/// to all zeros.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sll_pi32(__m64 __m, __m64 __count)
-{
- return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
-}
-
-/// Left-shifts each 32-bit signed integer element of a 64-bit integer
-/// vector of [2 x i32] by the number of bits specified by a 32-bit integer.
-/// The lower 32 bits of the results are packed into a 64-bit integer vector
-/// of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector of [2 x i32].
-/// \param __count
-/// A 32-bit integer value.
-/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
-/// values. If \a __count is greater than or equal to 32, the result is set
-/// to all zeros.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_slli_pi32(__m64 __m, int __count)
-{
- return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
-}
-
-/// Left-shifts the first 64-bit integer parameter by the number of bits
-/// specified by the second 64-bit integer parameter. The lower 64 bits of
-/// the result are returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \param __count
-/// A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector containing the left-shifted value. If
-/// \a __count is greater than or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sll_si64(__m64 __m, __m64 __count)
-{
- return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
-}
-
-/// Left-shifts the first parameter, which is a 64-bit integer, by the
-/// number of bits specified by the second parameter, which is a 32-bit
-/// integer. The lower 64 bits of the result are returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \param __count
-/// A 32-bit integer value.
-/// \returns A 64-bit integer vector containing the left-shifted value. If
-/// \a __count is greater than or equal to 64, the result is set to 0.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_slli_si64(__m64 __m, int __count)
-{
- return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
-}
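
/*
 * Illustrative sketch (not part of the original header): the immediate-count
 * shifts are a cheap way to rescale packed fixed-point values. The helper name
 * and the Q8.8 format are hypothetical.
 */
static inline __m64 example_scale_q8_by_4(__m64 q8_values)
{
    /* Multiplying four Q8.8 fixed-point lanes by 4 is a left shift by 2. */
    return _mm_slli_pi16(q8_values, 2);
}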
-
-/// Right-shifts each 16-bit integer element of the first parameter,
-/// which is a 64-bit integer vector of [4 x i16], by the number of bits
-/// specified by the second parameter, which is a 64-bit integer.
-///
-/// High-order bits are filled with the sign bit of the initial value of each
-/// 16-bit element. The 16-bit results are packed into a 64-bit integer
-/// vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector of [4 x i16].
-/// \param __count
-/// A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sra_pi16(__m64 __m, __m64 __count)
-{
- return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
-}
-
-/// Right-shifts each 16-bit integer element of a 64-bit integer vector
-/// of [4 x i16] by the number of bits specified by a 32-bit integer.
-///
-/// High-order bits are filled with the sign bit of the initial value of each
-/// 16-bit element. The 16-bit results are packed into a 64-bit integer
-/// vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector of [4 x i16].
-/// \param __count
-/// A 32-bit integer value.
-/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srai_pi16(__m64 __m, int __count)
-{
- return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
-}
-
-/// Right-shifts each 32-bit integer element of the first parameter,
-/// which is a 64-bit integer vector of [2 x i32], by the number of bits
-/// specified by the second parameter, which is a 64-bit integer.
-///
-/// High-order bits are filled with the sign bit of the initial value of each
-/// 32-bit element. The 32-bit results are packed into a 64-bit integer
-/// vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector of [2 x i32].
-/// \param __count
-/// A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_sra_pi32(__m64 __m, __m64 __count)
-{
- return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
-}
-
-/// Right-shifts each 32-bit integer element of a 64-bit integer vector
-/// of [2 x i32] by the number of bits specified by a 32-bit integer.
-///
-/// High-order bits are filled with the sign bit of the initial value of each
-/// 32-bit element. The 32-bit results are packed into a 64-bit integer
-/// vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector of [2 x i32].
-/// \param __count
-/// A 32-bit integer value.
-/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srai_pi32(__m64 __m, int __count)
-{
- return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
-}
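
/*
 * Illustrative sketch (not part of the original header): because the
 * arithmetic shifts replicate the sign bit, they act as a flooring division by
 * a power of two on signed lanes. The helper name is hypothetical.
 */
static inline __m64 example_div8_floor(__m64 signed_sums)
{
    /* Divide two packed signed 32-bit values by 8, rounding toward -infinity. */
    return _mm_srai_pi32(signed_sums, 3);
}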
-
-/// Right-shifts each 16-bit integer element of the first parameter,
-/// which is a 64-bit integer vector of [4 x i16], by the number of bits
-/// specified by the second parameter, which is a 64-bit integer.
-///
-/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
-/// integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector of [4 x i16].
-/// \param __count
-/// A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srl_pi16(__m64 __m, __m64 __count)
-{
- return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
-}
-
-/// Right-shifts each 16-bit integer element of a 64-bit integer vector
-/// of [4 x i16] by the number of bits specified by a 32-bit integer.
-///
-/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
-/// integer vector of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector of [4 x i16].
-/// \param __count
-/// A 32-bit integer value.
-/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srli_pi16(__m64 __m, int __count)
-{
- return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
-}
-
-/// Right-shifts each 32-bit integer element of the first parameter,
-/// which is a 64-bit integer vector of [2 x i32], by the number of bits
-/// specified by the second parameter, which is a 64-bit integer.
-///
-/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
-/// integer vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector of [2 x i32].
-/// \param __count
-/// A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srl_pi32(__m64 __m, __m64 __count)
-{
- return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
-}
-
-/// Right-shifts each 32-bit integer element of a 64-bit integer vector
-/// of [2 x i32] by the number of bits specified by a 32-bit integer.
-///
-/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
-/// integer vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector of [2 x i32].
-/// \param __count
-/// A 32-bit integer value.
-/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srli_pi32(__m64 __m, int __count)
-{
- return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
-}
-
-/// Right-shifts the first 64-bit integer parameter by the number of bits
-/// specified by the second 64-bit integer parameter.
-///
-/// High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \param __count
-/// A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srl_si64(__m64 __m, __m64 __count)
-{
- return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
-}
-
-/// Right-shifts the first parameter, which is a 64-bit integer, by the
-/// number of bits specified by the second parameter, which is a 32-bit
-/// integer.
-///
-/// High-order bits are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
-///
-/// \param __m
-/// A 64-bit integer vector interpreted as a single 64-bit integer.
-/// \param __count
-/// A 32-bit integer value.
-/// \returns A 64-bit integer vector containing the right-shifted value.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_srli_si64(__m64 __m, int __count)
-{
- return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
-}
-
-/// Performs a bitwise AND of two 64-bit integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PAND </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector.
-/// \param __m2
-/// A 64-bit integer vector.
-/// \returns A 64-bit integer vector containing the bitwise AND of both
-/// parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_and_si64(__m64 __m1, __m64 __m2)
-{
- return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
-}
-
-/// Performs a bitwise NOT of the first 64-bit integer vector, and then
-/// performs a bitwise AND of the intermediate result and the second 64-bit
-/// integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PANDN </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector. The one's complement of this parameter is used
-/// in the bitwise AND.
-/// \param __m2
-/// A 64-bit integer vector.
-/// \returns A 64-bit integer vector containing the bitwise AND of the second
-/// parameter and the one's complement of the first parameter.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_andnot_si64(__m64 __m1, __m64 __m2)
-{
- return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
-}
-
-/// Performs a bitwise OR of two 64-bit integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> POR </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector.
-/// \param __m2
-/// A 64-bit integer vector.
-/// \returns A 64-bit integer vector containing the bitwise OR of both
-/// parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_or_si64(__m64 __m1, __m64 __m2)
-{
- return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
-}
-
-/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PXOR </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector.
-/// \param __m2
-/// A 64-bit integer vector.
-/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
-/// parameters.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_xor_si64(__m64 __m1, __m64 __m2)
-{
- return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
-}
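
/*
 * Illustrative sketch (not part of the original header): the four bitwise
 * intrinsics compose into the classic branch-free select. The mask is expected
 * to hold all-ones in lanes taken from a and all-zeros in lanes taken from b,
 * as produced by the comparison intrinsics below. The helper name is
 * hypothetical.
 */
static inline __m64 example_select(__m64 mask, __m64 a, __m64 b)
{
    return _mm_or_si64(_mm_and_si64(mask, a),      /* lanes where the mask is set */
                       _mm_andnot_si64(mask, b));  /* lanes where the mask is clear */
}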
-
-/// Compares the 8-bit integer elements of two 64-bit integer vectors of
-/// [8 x i8] to determine if the element of the first vector is equal to the
-/// corresponding element of the second vector.
-///
-/// The comparison yields 0 for false, 0xFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [8 x i8].
-/// \param __m2
-/// A 64-bit integer vector of [8 x i8].
-/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
-/// results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Compares the 16-bit integer elements of two 64-bit integer vectors of
-/// [4 x i16] to determine if the element of the first vector is equal to the
-/// corresponding element of the second vector.
-///
-/// The comparison yields 0 for false, 0xFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16].
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
-/// results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Compares the 32-bit integer elements of two 64-bit integer vectors of
-/// [2 x i32] to determine if the element of the first vector is equal to the
-/// corresponding element of the second vector.
-///
-/// The comparison yields 0 for false, 0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [2 x i32].
-/// \param __m2
-/// A 64-bit integer vector of [2 x i32].
-/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
-/// results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
-}
-
-/// Compares the 8-bit integer elements of two 64-bit integer vectors of
-/// [8 x i8] to determine if the element of the first vector is greater than
-/// the corresponding element of the second vector.
-///
-/// The comparison yields 0 for false, 0xFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [8 x i8].
-/// \param __m2
-/// A 64-bit integer vector of [8 x i8].
-/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
-/// results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
-}
-
-/// Compares the 16-bit integer elements of two 64-bit integer vectors of
-/// [4 x i16] to determine if the element of the first vector is greater than
-/// the corresponding element of the second vector.
-///
-/// The comparison yields 0 for false, 0xFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [4 x i16].
-/// \param __m2
-/// A 64-bit integer vector of [4 x i16].
-/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
-/// results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
-}
-
-/// Compares the 32-bit integer elements of two 64-bit integer vectors of
-/// [2 x i32] to determine if the element of the first vector is greater than
-/// the corresponding element of the second vector.
-///
-/// The comparison yields 0 for false, 0xFFFFFFFF for true.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
-///
-/// \param __m1
-/// A 64-bit integer vector of [2 x i32].
-/// \param __m2
-/// A 64-bit integer vector of [2 x i32].
-/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
-/// results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
-{
- return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
-}
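
/*
 * Illustrative sketch (not part of the original header): a comparison result
 * is an all-ones/all-zeros lane mask, so a packed signed maximum needs one
 * compare plus three bitwise operations. The helper name is hypothetical.
 */
static inline __m64 example_max_pi16(__m64 a, __m64 b)
{
    __m64 a_gt_b = _mm_cmpgt_pi16(a, b);            /* 0xFFFF where a > b, 0 elsewhere */
    return _mm_or_si64(_mm_and_si64(a_gt_b, a),     /* keep a where a > b */
                       _mm_andnot_si64(a_gt_b, b)); /* keep b elsewhere */
}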
-
-/// Constructs a 64-bit integer vector initialized to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PXOR </c> instruction.
-///
-/// \returns An initialized 64-bit integer vector with all elements set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_setzero_si64(void)
-{
- return __extension__ (__m64){ 0LL };
-}
-
-/// Constructs a 64-bit integer vector initialized with the specified
-/// 32-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __i1
-/// A 32-bit integer value used to initialize the upper 32 bits of the
-/// result.
-/// \param __i0
-/// A 32-bit integer value used to initialize the lower 32 bits of the
-/// result.
-/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_set_pi32(int __i1, int __i0)
-{
- return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
-}
-
-/// Constructs a 64-bit integer vector initialized with the specified
-/// 16-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __s3
-/// A 16-bit integer value used to initialize bits [63:48] of the result.
-/// \param __s2
-/// A 16-bit integer value used to initialize bits [47:32] of the result.
-/// \param __s1
-/// A 16-bit integer value used to initialize bits [31:16] of the result.
-/// \param __s0
-/// A 16-bit integer value used to initialize bits [15:0] of the result.
-/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
-{
- return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
-}
-
-/// Constructs a 64-bit integer vector initialized with the specified
-/// 8-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __b7
-/// An 8-bit integer value used to initialize bits [63:56] of the result.
-/// \param __b6
-/// An 8-bit integer value used to initialize bits [55:48] of the result.
-/// \param __b5
-/// An 8-bit integer value used to initialize bits [47:40] of the result.
-/// \param __b4
-/// An 8-bit integer value used to initialize bits [39:32] of the result.
-/// \param __b3
-/// An 8-bit integer value used to initialize bits [31:24] of the result.
-/// \param __b2
-/// An 8-bit integer value used to initialize bits [23:16] of the result.
-/// \param __b1
-/// An 8-bit integer value used to initialize bits [15:8] of the result.
-/// \param __b0
-/// An 8-bit integer value used to initialize bits [7:0] of the result.
-/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
- char __b1, char __b0)
-{
- return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
- __b4, __b5, __b6, __b7);
-}
-
-/// Constructs a 64-bit integer vector of [2 x i32], with each of the
-/// 32-bit integer vector elements set to the specified 32-bit integer
-/// value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __i
-/// A 32-bit integer value used to initialize each vector element of the
-/// result.
-/// \returns An initialized 64-bit integer vector of [2 x i32].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_set1_pi32(int __i)
-{
- return _mm_set_pi32(__i, __i);
-}
-
-/// Constructs a 64-bit integer vector of [4 x i16], with each of the
-/// 16-bit integer vector elements set to the specified 16-bit integer
-/// value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __w
-/// A 16-bit integer value used to initialize each vector element of the
-/// result.
-/// \returns An initialized 64-bit integer vector of [4 x i16].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_set1_pi16(short __w)
-{
- return _mm_set_pi16(__w, __w, __w, __w);
-}
-
-/// Constructs a 64-bit integer vector of [8 x i8], with each of the
-/// 8-bit integer vector elements set to the specified 8-bit integer value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __b
-/// An 8-bit integer value used to initialize each vector element of the
-/// result.
-/// \returns An initialized 64-bit integer vector of [8 x i8].
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_set1_pi8(char __b)
-{
- return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
-}
-
-/// Constructs a 64-bit integer vector, initialized in reverse order with
-/// the specified 32-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __i0
-/// A 32-bit integer value used to initialize the lower 32 bits of the
-/// result.
-/// \param __i1
-/// A 32-bit integer value used to initialize the upper 32 bits of the
-/// result.
-/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_setr_pi32(int __i0, int __i1)
-{
- return _mm_set_pi32(__i1, __i0);
-}
-
-/// Constructs a 64-bit integer vector, initialized in reverse order with
-/// the specified 16-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __w0
-/// A 16-bit integer value used to initialize bits [15:0] of the result.
-/// \param __w1
-/// A 16-bit integer value used to initialize bits [31:16] of the result.
-/// \param __w2
-/// A 16-bit integer value used to initialize bits [47:32] of the result.
-/// \param __w3
-/// A 16-bit integer value used to initialize bits [63:48] of the result.
-/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
-{
- return _mm_set_pi16(__w3, __w2, __w1, __w0);
-}
-
-/// Constructs a 64-bit integer vector, initialized in reverse order with
-/// the specified 8-bit integer values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __b0
-/// An 8-bit integer value used to initialize bits [7:0] of the result.
-/// \param __b1
-/// An 8-bit integer value used to initialize bits [15:8] of the result.
-/// \param __b2
-/// An 8-bit integer value used to initialize bits [23:16] of the result.
-/// \param __b3
-/// An 8-bit integer value used to initialize bits [31:24] of the result.
-/// \param __b4
-/// An 8-bit integer value used to initialize bits [39:32] of the result.
-/// \param __b5
-/// An 8-bit integer value used to initialize bits [47:40] of the result.
-/// \param __b6
-/// An 8-bit integer value used to initialize bits [55:48] of the result.
-/// \param __b7
-/// An 8-bit integer value used to initialize bits [63:56] of the result.
-/// \returns An initialized 64-bit integer vector.
-static __inline__ __m64 __DEFAULT_FN_ATTRS
-_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
- char __b6, char __b7)
-{
- return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
-}
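
/*
 * Illustrative sketch (not part of the original header): _mm_set_* lists
 * elements from the most significant position down, while _mm_setr_* lists
 * them from bits [15:0] upward, so the two calls below build the same vector.
 * The helper name is hypothetical.
 */
static inline int example_set_vs_setr(void)
{
    __m64 x = _mm_set_pi16(4, 3, 2, 1);   /* bits [63:48] = 4, ..., bits [15:0] = 1 */
    __m64 y = _mm_setr_pi16(1, 2, 3, 4);  /* same layout, arguments reversed */
    int equal = _mm_cvtm64_si64(x) == _mm_cvtm64_si64(y);
    _mm_empty();
    return equal;                         /* 1; both hold 0x0004000300020001 */
}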
-
-#undef __DEFAULT_FN_ATTRS
-
-/* Aliases for compatibility. */
-#define _m_empty _mm_empty
-#define _m_from_int _mm_cvtsi32_si64
-#define _m_from_int64 _mm_cvtsi64_m64
-#define _m_to_int _mm_cvtsi64_si32
-#define _m_to_int64 _mm_cvtm64_si64
-#define _m_packsswb _mm_packs_pi16
-#define _m_packssdw _mm_packs_pi32
-#define _m_packuswb _mm_packs_pu16
-#define _m_punpckhbw _mm_unpackhi_pi8
-#define _m_punpckhwd _mm_unpackhi_pi16
-#define _m_punpckhdq _mm_unpackhi_pi32
-#define _m_punpcklbw _mm_unpacklo_pi8
-#define _m_punpcklwd _mm_unpacklo_pi16
-#define _m_punpckldq _mm_unpacklo_pi32
-#define _m_paddb _mm_add_pi8
-#define _m_paddw _mm_add_pi16
-#define _m_paddd _mm_add_pi32
-#define _m_paddsb _mm_adds_pi8
-#define _m_paddsw _mm_adds_pi16
-#define _m_paddusb _mm_adds_pu8
-#define _m_paddusw _mm_adds_pu16
-#define _m_psubb _mm_sub_pi8
-#define _m_psubw _mm_sub_pi16
-#define _m_psubd _mm_sub_pi32
-#define _m_psubsb _mm_subs_pi8
-#define _m_psubsw _mm_subs_pi16
-#define _m_psubusb _mm_subs_pu8
-#define _m_psubusw _mm_subs_pu16
-#define _m_pmaddwd _mm_madd_pi16
-#define _m_pmulhw _mm_mulhi_pi16
-#define _m_pmullw _mm_mullo_pi16
-#define _m_psllw _mm_sll_pi16
-#define _m_psllwi _mm_slli_pi16
-#define _m_pslld _mm_sll_pi32
-#define _m_pslldi _mm_slli_pi32
-#define _m_psllq _mm_sll_si64
-#define _m_psllqi _mm_slli_si64
-#define _m_psraw _mm_sra_pi16
-#define _m_psrawi _mm_srai_pi16
-#define _m_psrad _mm_sra_pi32
-#define _m_psradi _mm_srai_pi32
-#define _m_psrlw _mm_srl_pi16
-#define _m_psrlwi _mm_srli_pi16
-#define _m_psrld _mm_srl_pi32
-#define _m_psrldi _mm_srli_pi32
-#define _m_psrlq _mm_srl_si64
-#define _m_psrlqi _mm_srli_si64
-#define _m_pand _mm_and_si64
-#define _m_pandn _mm_andnot_si64
-#define _m_por _mm_or_si64
-#define _m_pxor _mm_xor_si64
-#define _m_pcmpeqb _mm_cmpeq_pi8
-#define _m_pcmpeqw _mm_cmpeq_pi16
-#define _m_pcmpeqd _mm_cmpeq_pi32
-#define _m_pcmpgtb _mm_cmpgt_pi8
-#define _m_pcmpgtw _mm_cmpgt_pi16
-#define _m_pcmpgtd _mm_cmpgt_pi32
-
-#endif /* __MMINTRIN_H */
-
+++ /dev/null
-/*===------------------------- movdirintrin.h ------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <movdirintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef _MOVDIRINTRIN_H
-#define _MOVDIRINTRIN_H
-
-/* Move doubleword as direct store */
-static __inline__ void
-__attribute__((__always_inline__, __nodebug__, __target__("movdiri")))
-_directstoreu_u32 (void *__dst, unsigned int __value)
-{
- __builtin_ia32_directstore_u32((unsigned int *)__dst, (unsigned int)__value);
-}
-
-#ifdef __x86_64__
-
-/* Move quadword as direct store */
-static __inline__ void
-__attribute__((__always_inline__, __nodebug__, __target__("movdiri")))
-_directstoreu_u64 (void *__dst, unsigned long __value)
-{
- __builtin_ia32_directstore_u64((unsigned long *)__dst, __value);
-}
-
-#endif /* __x86_64__ */
-
-/*
- * movdir64b - Move 64 bytes as direct store.
- * The destination must be 64 byte aligned, and the store is atomic.
- * The source address has no alignment requirement, and the load from
- * the source address is not atomic.
- */
-static __inline__ void
-__attribute__((__always_inline__, __nodebug__, __target__("movdir64b")))
-_movdir64b (void *__dst __attribute__((align_value(64))), const void *__src)
-{
- __builtin_ia32_movdir64b(__dst, __src);
-}
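
/*
 * Illustrative sketch (not part of the original header): _movdir64b is
 * typically used to post a 64-byte descriptor to a device work-submission
 * portal in one atomic store. The destination must be 64-byte aligned; the
 * source has no alignment requirement. Both names below are hypothetical.
 */
static inline void example_post_descriptor(void *device_portal,
                                           const void *descriptor /* 64 bytes */)
{
    /* device_portal usually maps an uncached, 64-byte-aligned MMIO window;
     * the full 64-byte payload reaches the device as a single write.      */
    _movdir64b(device_portal, descriptor);
}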
-
-#endif /* _MOVDIRINTRIN_H */
+++ /dev/null
-/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#error "Never use <mwaitxintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __MWAITXINTRIN_H
-#define __MWAITXINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mwaitx")))
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_monitorx(void * __p, unsigned __extensions, unsigned __hints)
-{
- __builtin_ia32_monitorx(__p, __extensions, __hints);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock)
-{
- __builtin_ia32_mwaitx(__extensions, __hints, __clock);
-}
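
/*
 * Illustrative sketch (not part of the original header): the usual pattern
 * pairs _mm_monitorx with _mm_mwaitx to sleep until another core stores to a
 * flag. All-zero extensions/hints/clock request the plain "wake on store or
 * interrupt" behaviour; the loop re-checks the flag because the wait may end
 * spuriously. The helper name is hypothetical.
 */
static inline void example_wait_for_flag(volatile unsigned int *flag)
{
    while (*flag == 0) {
        _mm_monitorx((void *)flag, 0, 0); /* arm the monitor on the flag's cache line */
        if (*flag == 0)                   /* avoid sleeping past a store that raced ahead */
            _mm_mwaitx(0, 0, 0);          /* sleep until a store, interrupt, or spurious wake */
    }
}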
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __MWAITXINTRIN_H */
+++ /dev/null
-/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __NMMINTRIN_H
-#define __NMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-/* To match gcc's expectations, the SSE4.2 definitions are placed in
-   smmintrin.h, so simply include that header here. */
-#include <smmintrin.h>
-#endif /* __NMMINTRIN_H */
+++ /dev/null
-/*===---- pconfigintrin.h - X86 platform configuration ---------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <pconfigintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __PCONFIGINTRIN_H
-#define __PCONFIGINTRIN_H
-
-#define __PCONFIG_KEY_PROGRAM 0x00000001
-
-#if __has_extension(gnu_asm)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("pconfig")))
-
-static __inline unsigned int __DEFAULT_FN_ATTRS
-_pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
-{
- unsigned int __result;
- __asm__ ("pconfig"
- : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
- : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
- : "cc");
- return __result;
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __has_extension(gnu_asm) */
-
-#endif
+++ /dev/null
-/*===---- pkuintrin.h - PKU intrinsics -------------------------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <pkuintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __PKUINTRIN_H
-#define __PKUINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku")))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_rdpkru_u32(void)
-{
- return __builtin_ia32_rdpkru();
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_wrpkru(unsigned int __val)
-{
- __builtin_ia32_wrpkru(__val);
-}
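
/*
 * Illustrative sketch (not part of the original header): PKRU holds two bits
 * per protection key, access-disable (bit 2*key) and write-disable
 * (bit 2*key + 1). This hypothetical helper revokes write access for a key the
 * kernel has already allocated (e.g. via pkey_alloc()) for the calling thread.
 */
static inline void example_make_pkey_readonly(unsigned int pkey)
{
    unsigned int pkru = _rdpkru_u32();
    pkru |= 1u << (2 * pkey + 1);  /* set the write-disable bit for this key */
    _wrpkru(pkru);
}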
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
+++ /dev/null
-/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __PMMINTRIN_H
-#define __PMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include <emmintrin.h>
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128)))
-
-/// Loads data from an unaligned memory location to elements in a 128-bit
-/// vector.
-///
-/// If the address of the data is not 16-byte aligned, the instruction may
-/// read two adjacent aligned blocks of memory to retrieve the requested
-/// data.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
-///
-/// \param __p
-/// A pointer to a 128-bit integer vector containing integer values.
-/// \returns A 128-bit vector containing the moved values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_lddqu_si128(__m128i const *__p)
-{
- return (__m128i)__builtin_ia32_lddqu((char const *)__p);
-}
-
-/// Subtracts the even-indexed values and adds the odd-indexed values of
-/// two 128-bit vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the left source operand.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing the right source operand.
-/// \returns A 128-bit vector of [4 x float] containing the alternating sums and
-/// differences of both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_addsub_ps(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Horizontally adds the adjacent pairs of values contained in two
-/// 128-bit vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// The horizontal sums of the values are stored in the lower bits of the
-/// destination.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// The horizontal sums of the values are stored in the upper bits of the
-/// destination.
-/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of
-/// both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_hadd_ps(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b);
-}
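
/*
 * Illustrative sketch (not part of the original header): two back-to-back
 * horizontal adds reduce a [4 x float] vector to its total in every lane.
 * _mm_cvtss_f32 comes from the SSE headers already included above; the helper
 * name is hypothetical.
 */
static inline float example_sum4(__m128 v)
{
    __m128 t = _mm_hadd_ps(v, v);  /* (v0+v1, v2+v3, v0+v1, v2+v3) */
    t = _mm_hadd_ps(t, t);         /* every lane now holds v0+v1+v2+v3 */
    return _mm_cvtss_f32(t);       /* extract the scalar total */
}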
-
-/// Horizontally subtracts the adjacent pairs of values contained in two
-/// 128-bit vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// The horizontal differences between the values are stored in the lower
-/// bits of the destination.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// The horizontal differences between the values are stored in the upper
-/// bits of the destination.
-/// \returns A 128-bit vector of [4 x float] containing the horizontal
-/// differences of both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_hsub_ps(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Moves and duplicates odd-indexed values from a 128-bit vector
-/// of [4 x float] to float values stored in a 128-bit vector of
-/// [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. \n
-/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of
-/// the destination. \n
-/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the
-/// destination.
-/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
-/// values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_movehdup_ps(__m128 __a)
-{
- return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3);
-}
-
-/// Duplicates even-indexed values from a 128-bit vector of
-/// [4 x float] to float values stored in a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] \n
-/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of
-/// the destination. \n
-/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the
-/// destination.
-/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated
-/// values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_moveldup_ps(__m128 __a)
-{
- return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2);
-}
-
-/// Subtracts the even-indexed values and adds the odd-indexed values of
-/// two 128-bit vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing the left source operand.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing the right source operand.
-/// \returns A 128-bit vector of [2 x double] containing the alternating sums
-/// and differences of both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_addsub_pd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Horizontally adds the pairs of values contained in two 128-bit
-/// vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// The horizontal sum of the values is stored in the lower bits of the
-/// destination.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// The horizontal sum of the values is stored in the upper bits of the
-/// destination.
-/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of
-/// both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_hadd_pd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Horizontally subtracts the pairs of values contained in two 128-bit
-/// vectors of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// The horizontal difference of the values is stored in the lower bits of
-/// the destination.
-/// \param __b
-/// A 128-bit vector of [2 x double] containing one of the source operands.
-/// The horizontal difference of the values is stored in the upper bits of
-/// the destination.
-/// \returns A 128-bit vector of [2 x double] containing the horizontal
-/// differences of both operands.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_hsub_pd(__m128d __a, __m128d __b)
-{
- return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b);
-}
-
-/// Moves and duplicates one double-precision value to double-precision
-/// values stored in a 128-bit vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_loaddup_pd(double const *dp);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
-///
-/// \param dp
-/// A pointer to a double-precision value to be moved and duplicated.
-/// \returns A 128-bit vector of [2 x double] containing the moved and
-/// duplicated values.
-#define _mm_loaddup_pd(dp) _mm_load1_pd(dp)
-
-/// Moves and duplicates the double-precision value in the lower bits of
-/// a 128-bit vector of [2 x double] to double-precision values stored in a
-/// 128-bit vector of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits
-/// [127:64] and [63:0] of the destination.
-/// \returns A 128-bit vector of [2 x double] containing the moved and
-/// duplicated values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_movedup_pd(__m128d __a)
-{
- return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
-}
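
/*
 * Illustrative sketch (not part of the original header): _mm_addsub_pd and
 * _mm_movedup_pd exist largely to speed up complex arithmetic. A hedged
 * example of one double-precision complex multiply, real part in the low lane;
 * the SSE2 intrinsics used come from <emmintrin.h>, included above, and the
 * helper name is hypothetical.
 */
static inline __m128d example_cmul_pd(__m128d a, __m128d b)
{
    __m128d ar = _mm_movedup_pd(a);                        /* (a.re, a.re) */
    __m128d ai = _mm_unpackhi_pd(a, a);                    /* (a.im, a.im) */
    __m128d t1 = _mm_mul_pd(ar, b);                        /* (a.re*b.re, a.re*b.im) */
    __m128d t2 = _mm_mul_pd(ai, _mm_shuffle_pd(b, b, 1));  /* (a.im*b.im, a.im*b.re) */
    return _mm_addsub_pd(t1, t2);  /* (a.re*b.re - a.im*b.im, a.re*b.im + a.im*b.re) */
}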
-
-/// Establishes a linear address memory range to be monitored and puts
-/// the processor in the monitor event pending state. Data stored in the
-/// monitored address range causes the processor to exit the pending state.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MONITOR </c> instruction.
-///
-/// \param __p
-/// The memory range to be monitored. The size of the range is determined by
-/// CPUID function 0000_0005h.
-/// \param __extensions
-/// Optional extensions for the monitoring state.
-/// \param __hints
-/// Optional hints for the monitoring state.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints)
-{
- __builtin_ia32_monitor(__p, __extensions, __hints);
-}
-
-/// Used with the MONITOR instruction to wait while the processor is in
-/// the monitor event pending state. Data stored in the monitored address
-/// range causes the processor to exit the pending state.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MWAIT </c> instruction.
-///
-/// \param __extensions
-/// Optional extensions for the monitoring state, which may vary by
-/// processor.
-/// \param __hints
-/// Optional hints for the monitoring state, which may vary by processor.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_mwait(unsigned __extensions, unsigned __hints)
-{
- __builtin_ia32_mwait(__extensions, __hints);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __PMMINTRIN_H */
+++ /dev/null
-/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __POPCNTINTRIN_H
-#define __POPCNTINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt")))
-
-#if defined(__cplusplus) && (__cplusplus >= 201103L)
-#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
-#else
-#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
-#endif
-
-/// Counts the number of bits in the source operand having a value of 1.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
-///
-/// \param __A
-/// An unsigned 32-bit integer operand.
-/// \returns A 32-bit integer containing the number of bits with value 1 in the
-/// source operand.
-static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_popcnt_u32(unsigned int __A)
-{
- return __builtin_popcount(__A);
-}
-
-#ifdef __x86_64__
-/// Counts the number of bits in the source operand having a value of 1.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> POPCNT </c> instruction.
-///
-/// \param __A
-/// An unsigned 64-bit integer operand.
-/// \returns A 64-bit integer containing the number of bits with value 1 in the
-/// source operand.
-static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm_popcnt_u64(unsigned long long __A)
-{
- return __builtin_popcountll(__A);
-}
-#endif /* __x86_64__ */
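Illustrative use of the POPCNT intrinsics (not part of the removed header); assumes a POPCNT-capable target built with -mpopcnt:

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    printf("%d\n", _mm_popcnt_u32(0xF0F0u));          /* 8 */
#ifdef __x86_64__
    printf("%lld\n", _mm_popcnt_u64(0xFFULL << 32));  /* 8 */
#endif
    return 0;
}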
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_CONSTEXPR
-
-#endif /* __POPCNTINTRIN_H */
+++ /dev/null
-/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED)
-#error "Never use <prfchwintrin.h> directly; include <x86intrin.h> or <mm3dnow.h> instead."
-#endif
-
-#ifndef __PRFCHWINTRIN_H
-#define __PRFCHWINTRIN_H
-
-/// Loads a memory sequence containing the specified memory address into
-/// all data cache levels. The cache-coherency state is set to exclusive.
-/// Data can be read from and written to the cache line without additional
-/// delay.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PREFETCHT0 instruction.
-///
-/// \param __P
-/// A pointer specifying the memory address to be prefetched.
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
-_m_prefetch(void *__P)
-{
- __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
-}
-
-/// Loads a memory sequence containing the specified memory address into
-/// the L1 data cache and sets the cache-coherency to modified. This
-/// provides a hint to the processor that the cache line will be modified.
-/// It is intended for use when the cache line will be written to shortly
-/// after the prefetch is performed.
-///
-/// Note that the effect of this intrinsic is dependent on the processor
-/// implementation.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PREFETCHW instruction.
-///
-/// \param __P
-/// A pointer specifying the memory address to be prefetched.
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
-_m_prefetchw(volatile const void *__P)
-{
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wcast-qual"
- __builtin_prefetch ((const void*)__P, 1, 3 /* _MM_HINT_T0 */);
-#pragma clang diagnostic pop
-}
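A sketch of how the write-intent prefetch might be used (illustrative only, not part of this patch); it assumes -mprfchw and access through <x86intrin.h>:

#include <x86intrin.h>

void bump(int *counters, int idx)
{
    _m_prefetchw(&counters[idx]);  /* hint: this cache line will be written shortly */
    counters[idx] += 1;
}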
-
-#endif /* __PRFCHWINTRIN_H */
+++ /dev/null
-/*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <ptwriteintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __PTWRITEINTRIN_H
-#define __PTWRITEINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("ptwrite")))
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_ptwrite32(unsigned int __value) {
- __builtin_ia32_ptwrite32(__value);
-}
-
-#ifdef __x86_64__
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_ptwrite64(unsigned long long __value) {
- __builtin_ia32_ptwrite64(__value);
-}
-
-#endif /* __x86_64__ */
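Illustrative use of _ptwrite32 (not part of this patch); the value is only recorded when Intel Processor Trace is collecting, and the target must be built with -mptwrite:

#include <immintrin.h>

void trace_progress(unsigned int step)
{
    _ptwrite32(step);  /* emitted as a PTW packet when tracing is active */
}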
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __PTWRITEINTRIN_H */
+++ /dev/null
-/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <rdseedintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __RDSEEDINTRIN_H
-#define __RDSEEDINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed")))
-
-static __inline__ int __DEFAULT_FN_ATTRS
-_rdseed16_step(unsigned short *__p)
-{
- return __builtin_ia32_rdseed16_step(__p);
-}
-
-static __inline__ int __DEFAULT_FN_ATTRS
-_rdseed32_step(unsigned int *__p)
-{
- return __builtin_ia32_rdseed32_step(__p);
-}
-
-#ifdef __x86_64__
-static __inline__ int __DEFAULT_FN_ATTRS
-_rdseed64_step(unsigned long long *__p)
-{
- return __builtin_ia32_rdseed64_step(__p);
-}
-#endif
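A usage sketch for the RDSEED step functions (illustrative only, assumes -mrdseed); a zero return is a legitimate transient failure, so callers retry:

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    unsigned int seed;
    while (!_rdseed32_step(&seed))
        ;  /* entropy temporarily unavailable; production code would bound this loop */
    printf("seed = %u\n", seed);
    return 0;
}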
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __RDSEEDINTRIN_H */
+++ /dev/null
-/*===---- rtmintrin.h - RTM intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <rtmintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __RTMINTRIN_H
-#define __RTMINTRIN_H
-
-#define _XBEGIN_STARTED (~0u)
-#define _XABORT_EXPLICIT (1 << 0)
-#define _XABORT_RETRY (1 << 1)
-#define _XABORT_CONFLICT (1 << 2)
-#define _XABORT_CAPACITY (1 << 3)
-#define _XABORT_DEBUG (1 << 4)
-#define _XABORT_NESTED (1 << 5)
-#define _XABORT_CODE(x) (((x) >> 24) & 0xFF)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-_xbegin(void)
-{
- return __builtin_ia32_xbegin();
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xend(void)
-{
- __builtin_ia32_xend();
-}
-
-#define _xabort(imm) __builtin_ia32_xabort((imm))
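A sketch of the usual RTM pattern (illustrative only, assumes -mrtm and a CPU with working TSX): attempt a transaction and fall back to a lock on abort. lock_acquire/lock_release are hypothetical helpers standing in for a real mutex:

#include <immintrin.h>

extern void lock_acquire(void);  /* hypothetical fallback lock */
extern void lock_release(void);

void increment(volatile long *counter)
{
    unsigned int status = _xbegin();
    if (status == _XBEGIN_STARTED) {
        *counter += 1;   /* transactional update */
        _xend();
    } else {
        lock_acquire();  /* aborted (or TSX unavailable): take the fallback path */
        *counter += 1;
        lock_release();
    }
}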
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __RTMINTRIN_H */
+++ /dev/null
-/*===--------------- serializeintrin.h - serialize intrinsics --------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <serializeintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __SERIALIZEINTRIN_H
-#define __SERIALIZEINTRIN_H
-
-/// Serialize instruction fetch and execution.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> SERIALIZE </c> instruction.
-///
-static __inline__ void
-__attribute__((__always_inline__, __nodebug__, __target__("serialize")))
-_serialize (void)
-{
- __builtin_ia32_serialize ();
-}
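Illustrative use of _serialize (not part of this patch, assumes -mserialize), e.g. after patching code bytes at run time:

#include <immintrin.h>

void after_code_patch(void)
{
    _serialize();  /* drain fetched/decoded instructions before executing the new code */
}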
-
-#endif /* __SERIALIZEINTRIN_H */
+++ /dev/null
-/*===---- sgxintrin.h - X86 SGX intrinsics configuration -------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <sgxintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __SGXINTRIN_H
-#define __SGXINTRIN_H
-
-#if __has_extension(gnu_asm)
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("sgx")))
-
-static __inline unsigned int __DEFAULT_FN_ATTRS
-_enclu_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
-{
- unsigned int __result;
- __asm__ ("enclu"
- : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
- : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
- : "cc");
- return __result;
-}
-
-static __inline unsigned int __DEFAULT_FN_ATTRS
-_encls_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
-{
- unsigned int __result;
- __asm__ ("encls"
- : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
- : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
- : "cc");
- return __result;
-}
-
-static __inline unsigned int __DEFAULT_FN_ATTRS
-_enclv_u32(unsigned int __leaf, __SIZE_TYPE__ __d[])
-{
- unsigned int __result;
- __asm__ ("enclv"
- : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2])
- : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2])
- : "cc");
- return __result;
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __has_extension(gnu_asm) */
-
-#endif
+++ /dev/null
-/*===---- shaintrin.h - SHA intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __SHAINTRIN_H
-#define __SHAINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha"), __min_vector_width__(128)))
-
-#define _mm_sha1rnds4_epu32(V1, V2, M) \
- __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha1nexte_epu32(__m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha1msg1_epu32(__m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha1msg2_epu32(__m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z)
-{
- return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha256msg1_epu32(__m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha256msg2_epu32(__m128i __X, __m128i __Y)
-{
- return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y);
-}
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __SHAINTRIN_H */
+++ /dev/null
-/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __SMMINTRIN_H
-#define __SMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include <tmmintrin.h>
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))
-
-/* SSE4 Rounding macros. */
-#define _MM_FROUND_TO_NEAREST_INT 0x00
-#define _MM_FROUND_TO_NEG_INF 0x01
-#define _MM_FROUND_TO_POS_INF 0x02
-#define _MM_FROUND_TO_ZERO 0x03
-#define _MM_FROUND_CUR_DIRECTION 0x04
-
-#define _MM_FROUND_RAISE_EXC 0x00
-#define _MM_FROUND_NO_EXC 0x08
-
-#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
-#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
-#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
-#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
-#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
-#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
-
-/// Rounds up each element of the 128-bit vector of [4 x float] to an
-/// integer and returns the rounded values in a 128-bit vector of
-/// [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_ceil_ps(__m128 X);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [4 x float] values to be rounded up.
-/// \returns A 128-bit vector of [4 x float] containing the rounded values.
-#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
-
-/// Rounds up each element of the 128-bit vector of [2 x double] to an
-/// integer and returns the rounded values in a 128-bit vector of
-/// [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_ceil_pd(__m128d X);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [2 x double] values to be rounded up.
-/// \returns A 128-bit vector of [2 x double] containing the rounded values.
-#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
-
-/// Copies three upper elements of the first 128-bit vector operand to
-/// the corresponding three upper elements of the 128-bit result vector of
-/// [4 x float]. Rounds up the lowest element of the second 128-bit vector
-/// operand to an integer and copies it to the lowest element of the 128-bit
-/// result vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
-/// copied to the corresponding bits of the result.
-/// \param Y
-/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
-/// rounded up to the nearest integer and copied to the corresponding bits
-/// of the result.
-/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
-/// values.
-#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
-
-/// Copies the upper element of the first 128-bit vector operand to the
-/// corresponding upper element of the 128-bit result vector of [2 x double].
-/// Rounds up the lower element of the second 128-bit vector operand to an
-/// integer and copies it to the lower element of the 128-bit result vector
-/// of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
-/// copied to the corresponding bits of the result.
-/// \param Y
-/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
-/// rounded up to the nearest integer and copied to the corresponding bits
-/// of the result.
-/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
-/// values.
-#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
-
-/// Rounds down each element of the 128-bit vector of [4 x float] to an
-/// integer and returns the rounded values in a 128-bit vector of
-/// [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_floor_ps(__m128 X);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [4 x float] values to be rounded down.
-/// \returns A 128-bit vector of [4 x float] containing the rounded values.
-#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
-
-/// Rounds down each element of the 128-bit vector of [2 x double] to an
-/// integer and returns the rounded values in a 128-bit vector of
-/// [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_floor_pd(__m128d X);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [2 x double].
-/// \returns A 128-bit vector of [2 x double] containing the rounded values.
-#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
-
-/// Copies three upper elements of the first 128-bit vector operand to
-/// the corresponding three upper elements of the 128-bit result vector of
-/// [4 x float]. Rounds down the lowest element of the second 128-bit vector
-/// operand to an integer and copies it to the lowest element of the 128-bit
-/// result vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
-/// copied to the corresponding bits of the result.
-/// \param Y
-/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
-/// rounded down to the nearest integer and copied to the corresponding bits
-/// of the result.
-/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
-/// values.
-#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
-
-/// Copies the upper element of the first 128-bit vector operand to the
-/// corresponding upper element of the 128-bit result vector of [2 x double].
-/// Rounds down the lower element of the second 128-bit vector operand to an
-/// integer and copies it to the lower element of the 128-bit result vector
-/// of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
-/// copied to the corresponding bits of the result.
-/// \param Y
-/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
-/// rounded down to the nearest integer and copied to the corresponding bits
-/// of the result.
-/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
-/// values.
-#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
-
-/// Rounds each element of the 128-bit vector of [4 x float] to an
-/// integer value according to the rounding control specified by the second
-/// argument and returns the rounded values in a 128-bit vector of
-/// [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_round_ps(__m128 X, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [4 x float].
-/// \param M
-/// An integer value that specifies the rounding operation. \n
-/// Bits [7:4] are reserved. \n
-/// Bit [3] is a precision exception value: \n
-/// 0: A normal PE exception is used \n
-/// 1: The PE field is not updated \n
-/// Bit [2] is the rounding control source: \n
-/// 0: Use bits [1:0] of \a M \n
-/// 1: Use the current MXCSR setting \n
-/// Bits [1:0] contain the rounding control definition: \n
-/// 00: Nearest \n
-/// 01: Downward (toward negative infinity) \n
-/// 10: Upward (toward positive infinity) \n
-/// 11: Truncated
-/// \returns A 128-bit vector of [4 x float] containing the rounded values.
-#define _mm_round_ps(X, M) \
- ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
-
-/// Copies three upper elements of the first 128-bit vector operand to
-/// the corresponding three upper elements of the 128-bit result vector of
-/// [4 x float]. Rounds the lowest element of the second 128-bit vector
-/// operand to an integer value according to the rounding control specified
-/// by the third argument and copies it to the lowest element of the 128-bit
-/// result vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
-/// copied to the corresponding bits of the result.
-/// \param Y
-/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
-/// rounded to the nearest integer using the specified rounding control and
-/// copied to the corresponding bits of the result.
-/// \param M
-/// An integer value that specifies the rounding operation. \n
-/// Bits [7:4] are reserved. \n
-/// Bit [3] is a precision exception value: \n
-/// 0: A normal PE exception is used \n
-/// 1: The PE field is not updated \n
-/// Bit [2] is the rounding control source: \n
-/// 0: Use bits [1:0] of \a M \n
-/// 1: Use the current MXCSR setting \n
-/// Bits [1:0] contain the rounding control definition: \n
-/// 00: Nearest \n
-/// 01: Downward (toward negative infinity) \n
-/// 10: Upward (toward positive infinity) \n
-/// 11: Truncated
-/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
-/// values.
-#define _mm_round_ss(X, Y, M) \
- ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
- (__v4sf)(__m128)(Y), (M)))
-
-/// Rounds each element of the 128-bit vector of [2 x double] to an
-/// integer value according to the rounding control specified by the second
-/// argument and returns the rounded values in a 128-bit vector of
-/// [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_round_pd(__m128d X, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [2 x double].
-/// \param M
-/// An integer value that specifies the rounding operation. \n
-/// Bits [7:4] are reserved. \n
-/// Bit [3] is a precision exception value: \n
-/// 0: A normal PE exception is used \n
-/// 1: The PE field is not updated \n
-/// Bit [2] is the rounding control source: \n
-/// 0: Use bits [1:0] of \a M \n
-/// 1: Use the current MXCSR setting \n
-/// Bits [1:0] contain the rounding control definition: \n
-/// 00: Nearest \n
-/// 01: Downward (toward negative infinity) \n
-/// 10: Upward (toward positive infinity) \n
-/// 11: Truncated
-/// \returns A 128-bit vector of [2 x double] containing the rounded values.
-#define _mm_round_pd(X, M) \
- ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
-
-/// Copies the upper element of the first 128-bit vector operand to the
-/// corresponding upper element of the 128-bit result vector of [2 x double].
-/// Rounds the lower element of the second 128-bit vector operand to an
-/// integer value according to the rounding control specified by the third
-/// argument and copies it to the lower element of the 128-bit result vector
-/// of [2 x double].
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
-/// copied to the corresponding bits of the result.
-/// \param Y
-/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
-/// rounded to the nearest integer using the specified rounding control and
-/// copied to the corresponding bits of the result.
-/// \param M
-/// An integer value that specifies the rounding operation. \n
-/// Bits [7:4] are reserved. \n
-/// Bit [3] is a precision exception value: \n
-/// 0: A normal PE exception is used \n
-/// 1: The PE field is not updated \n
-/// Bit [2] is the rounding control source: \n
-/// 0: Use bits [1:0] of \a M \n
-/// 1: Use the current MXCSR setting \n
-/// Bits [1:0] contain the rounding control definition: \n
-/// 00: Nearest \n
-/// 01: Downward (toward negative infinity) \n
-/// 10: Upward (toward positive infinity) \n
-/// 11: Truncated
-/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
-/// values.
-#define _mm_round_sd(X, Y, M) \
- ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
- (__v2df)(__m128d)(Y), (M)))
-
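A sketch exercising the ceil/floor/round wrappers above (illustrative only, assumes -msse4.1):

#include <smmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128 v    = _mm_set_ps(2.5f, -1.2f, 1.7f, 0.3f);  /* lanes {0.3, 1.7, -1.2, 2.5} */
    __m128 up   = _mm_ceil_ps(v);                        /* lanes {1, 2, -1, 3} */
    __m128 down = _mm_floor_ps(v);                       /* lanes {0, 1, -2, 2} */
    __m128 near = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    float o[4];
    _mm_storeu_ps(o, near);
    printf("%g %g %g %g\n", o[0], o[1], o[2], o[3]);     /* 0 2 -1 2 (ties to even) */
    (void)up; (void)down;
    return 0;
}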
-/* SSE4 Packed Blending Intrinsics. */
-/// Returns a 128-bit vector of [2 x double] where the values are
-/// selected from either the first or second operand as specified by the
-/// third operand, the control mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
-///
-/// \param V1
-/// A 128-bit vector of [2 x double].
-/// \param V2
-/// A 128-bit vector of [2 x double].
-/// \param M
-/// An immediate integer operand, with mask bits [1:0] specifying how the
-/// values are to be copied. The position of the mask bit corresponds to the
-/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
-/// element in operand \a V1 is copied to the same position in the result.
-/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
-/// is copied to the same position in the result.
-/// \returns A 128-bit vector of [2 x double] containing the copied values.
-#define _mm_blend_pd(V1, V2, M) \
- ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
- (__v2df)(__m128d)(V2), (int)(M)))
-
-/// Returns a 128-bit vector of [4 x float] where the values are selected
-/// from either the first or second operand as specified by the third
-/// operand, the control mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
-///
-/// \param V1
-/// A 128-bit vector of [4 x float].
-/// \param V2
-/// A 128-bit vector of [4 x float].
-/// \param M
-/// An immediate integer operand, with mask bits [3:0] specifying how the
-/// values are to be copied. The position of the mask bit corresponds to the
-/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
-/// element in operand \a V1 is copied to the same position in the result.
-/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
-/// is copied to the same position in the result.
-/// \returns A 128-bit vector of [4 x float] containing the copied values.
-#define _mm_blend_ps(V1, V2, M) \
- ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
- (__v4sf)(__m128)(V2), (int)(M)))
-
-/// Returns a 128-bit vector of [2 x double] where the values are
-/// selected from either the first or second operand as specified by the
-/// third operand, the control mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [2 x double].
-/// \param __V2
-/// A 128-bit vector of [2 x double].
-/// \param __M
-/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
-/// values are to be copied. The position of the mask bit corresponds to the
-/// most significant bit of a copied value. When a mask bit is 0, the
-/// corresponding 64-bit element in operand \a __V1 is copied to the same
-/// position in the result. When a mask bit is 1, the corresponding 64-bit
-/// element in operand \a __V2 is copied to the same position in the result.
-/// \returns A 128-bit vector of [2 x double] containing the copied values.
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
-{
- return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2,
- (__v2df)__M);
-}
-
-/// Returns a 128-bit vector of [4 x float] where the values are
-/// selected from either the first or second operand as specified by the
-/// third operand, the control mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [4 x float].
-/// \param __V2
-/// A 128-bit vector of [4 x float].
-/// \param __M
-/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
-/// how the values are to be copied. The position of the mask bit corresponds
-/// to the most significant bit of a copied value. When a mask bit is 0, the
-/// corresponding 32-bit element in operand \a __V1 is copied to the same
-/// position in the result. When a mask bit is 1, the corresponding 32-bit
-/// element in operand \a __V2 is copied to the same position in the result.
-/// \returns A 128-bit vector of [4 x float] containing the copied values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
-{
- return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2,
- (__v4sf)__M);
-}
-
-/// Returns a 128-bit vector of [16 x i8] where the values are selected
-/// from either the first or second operand as specified by the third
-/// operand, the control mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [16 x i8].
-/// \param __V2
-/// A 128-bit vector of [16 x i8].
-/// \param __M
-/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
-/// how the values are to be copied. The position of the mask bit corresponds
-/// to the most significant bit of a copied value. When a mask bit is 0, the
-/// corresponding 8-bit element in operand \a __V1 is copied to the same
-/// position in the result. When a mask bit is 1, the corresponding 8-bit
-/// element in operand \a __V2 is copied to the same position in the result.
-/// \returns A 128-bit vector of [16 x i8] containing the copied values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
-{
- return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2,
- (__v16qi)__M);
-}
-
-/// Returns a 128-bit vector of [8 x i16] where the values are selected
-/// from either the first or second operand as specified by the third
-/// operand, the control mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
-///
-/// \param V1
-/// A 128-bit vector of [8 x i16].
-/// \param V2
-/// A 128-bit vector of [8 x i16].
-/// \param M
-/// An immediate integer operand, with mask bits [7:0] specifying how the
-/// values are to be copied. The position of the mask bit corresponds to the
-/// index of a copied value. When a mask bit is 0, the corresponding 16-bit
-/// element in operand \a V1 is copied to the same position in the result.
-/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2
-/// is copied to the same position in the result.
-/// \returns A 128-bit vector of [8 x i16] containing the copied values.
-#define _mm_blend_epi16(V1, V2, M) \
- ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
- (__v8hi)(__m128i)(V2), (int)(M)))
-
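Illustrative use of the immediate blend (not part of this patch, assumes -msse4.1): mask bit i selects lane i from the second operand:

#include <smmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i a = _mm_set1_epi16(0);
    __m128i b = _mm_set1_epi16(7);
    __m128i r = _mm_blend_epi16(a, b, 0x0F);  /* lanes 0-3 from b, lanes 4-7 from a */
    short o[8];
    _mm_storeu_si128((__m128i *)o, r);
    printf("%d %d %d %d %d %d %d %d\n",
           o[0], o[1], o[2], o[3], o[4], o[5], o[6], o[7]);  /* 7 7 7 7 0 0 0 0 */
    return 0;
}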
-/* SSE4 Dword Multiply Instructions. */
-/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
-/// and returns the lower 32 bits of each product in a 128-bit vector of
-/// [4 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
-///
-/// \param __V1
-/// A 128-bit integer vector.
-/// \param __V2
-/// A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the products of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mullo_epi32 (__m128i __V1, __m128i __V2)
-{
- return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
-}
-
-/// Multiplies corresponding even-indexed elements of two 128-bit
-/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
-/// containing the products.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [4 x i32].
-/// \param __V2
-/// A 128-bit vector of [4 x i32].
-/// \returns A 128-bit vector of [2 x i64] containing the products of both
-/// operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mul_epi32 (__m128i __V1, __m128i __V2)
-{
- return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2);
-}
-
-/* SSE4 Floating Point Dot Product Instructions. */
-/// Computes the dot product of the two 128-bit vectors of [4 x float]
-/// and returns it in the elements of the 128-bit result vector of
-/// [4 x float].
-///
-/// The immediate integer operand controls which input elements
-/// will contribute to the dot product, and where the final results are
-/// returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [4 x float].
-/// \param Y
-/// A 128-bit vector of [4 x float].
-/// \param M
-/// An immediate integer operand. Mask bits [7:4] determine which elements
-/// of the input vectors are used, with bit [4] corresponding to the lowest
-/// element and bit [7] corresponding to the highest element of each [4 x
-/// float] vector. If a bit is set, the corresponding elements from the two
-/// input vectors are used as an input for dot product; otherwise that input
-/// is treated as zero. Bits [3:0] determine which elements of the result
-/// will receive a copy of the final dot product, with bit [0] corresponding
-/// to the lowest element and bit [3] corresponding to the highest element of
-/// each [4 x float] subvector. If a bit is set, the dot product is returned
-/// in the corresponding element; otherwise that element is set to zero.
-/// \returns A 128-bit vector of [4 x float] containing the dot product.
-#define _mm_dp_ps(X, Y, M) \
- ((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
- (__v4sf)(__m128)(Y), (M)))
-
-/// Computes the dot product of the two 128-bit vectors of [2 x double]
-/// and returns it in the elements of the 128-bit result vector of
-/// [2 x double].
-///
-/// The immediate integer operand controls which input
-/// elements will contribute to the dot product, and where the final results
-/// are returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [2 x double].
-/// \param Y
-/// A 128-bit vector of [2 x double].
-/// \param M
-/// An immediate integer operand. Mask bits [5:4] determine which elements
-/// of the input vectors are used, with bit [4] corresponding to the lowest
-/// element and bit [5] corresponding to the highest element of each [2 x
-/// double] vector. If a bit is set, the corresponding elements from the two
-/// input vectors are used as an input for dot product; otherwise that input
-/// is treated as zero. Bits [1:0] determine which elements of the result
-/// will receive a copy of the final dot product, with bit [0] corresponding
-/// to the lowest element and bit [1] corresponding to the highest element of
-/// each [2 x double] vector. If a bit is set, the dot product is returned in
-/// the corresponding element; otherwise that element is set to zero.
-#define _mm_dp_pd(X, Y, M) \
- ((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
- (__v2df)(__m128d)(Y), (M)))
-
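A dot-product sketch using the DPPS form above (illustrative only, assumes -msse4.1); mask 0xF1 consumes all four input lanes and writes the sum to lane 0 only:

#include <smmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  /* lanes {1, 2, 3, 4} */
    __m128 y = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);  /* lanes {5, 6, 7, 8} */
    __m128 d = _mm_dp_ps(x, y, 0xF1);               /* 1*5 + 2*6 + 3*7 + 4*8 = 70 */
    printf("%g\n", _mm_cvtss_f32(d));               /* 70 */
    return 0;
}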
-/* SSE4 Streaming Load Hint Instruction. */
-/// Loads integer values from a 128-bit aligned memory location to a
-/// 128-bit integer vector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
-///
-/// \param __V
-/// A pointer to a 128-bit aligned memory location that contains the integer
-/// values.
-/// \returns A 128-bit integer vector containing the data stored at the
-/// specified memory location.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_stream_load_si128 (__m128i const *__V)
-{
- return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V);
-}
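A non-temporal copy sketch built on the streaming load above (illustrative only, assumes -msse4.1); both pointers must be 16-byte aligned:

#include <smmintrin.h>
#include <stddef.h>

void copy_nt(__m128i *dst, const __m128i *src, size_t n16)
{
    for (size_t i = 0; i < n16; ++i) {
        __m128i v = _mm_stream_load_si128((__m128i *)&src[i]);  /* MOVNTDQA load */
        _mm_stream_si128(&dst[i], v);                           /* non-temporal store */
    }
}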
-
-/* SSE4 Packed Integer Min/Max Instructions. */
-/// Compares the corresponding elements of two 128-bit vectors of
-/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
-/// of the two values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [16 x i8].
-/// \param __V2
-/// A 128-bit vector of [16 x i8].
-/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_min_epi8 (__m128i __V1, __m128i __V2)
-{
-#if (__clang_major__ < 14)
- return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
-#else
- return (__m128i) __builtin_elementwise_min((__v16qs) __V1, (__v16qs) __V2);
-#endif
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
-/// greater value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [16 x i8].
-/// \param __V2
-/// A 128-bit vector of [16 x i8].
-/// \returns A 128-bit vector of [16 x i8] containing the greater values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_max_epi8 (__m128i __V1, __m128i __V2)
-{
-#if (__clang_major__ < 14)
- return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
-#else
- return (__m128i) __builtin_elementwise_max((__v16qs) __V1, (__v16qs) __V2);
-#endif
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
-/// value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [8 x u16].
-/// \param __V2
-/// A 128-bit vector of [8 x u16].
-/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_min_epu16 (__m128i __V1, __m128i __V2)
-{
-#if (__clang_major__ < 14)
- return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
-#else
- return (__m128i) __builtin_elementwise_min((__v8hu) __V1, (__v8hu) __V2);
-#endif
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
-/// greater value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [8 x u16].
-/// \param __V2
-/// A 128-bit vector of [8 x u16].
-/// \returns A 128-bit vector of [8 x u16] containing the greater values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_max_epu16 (__m128i __V1, __m128i __V2)
-{
-#if (__clang_major__ < 14)
- return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
-#else
- return (__m128i) __builtin_elementwise_max((__v8hu) __V1, (__v8hu) __V2);
-#endif
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
-/// value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [4 x i32].
-/// \param __V2
-/// A 128-bit vector of [4 x i32].
-/// \returns A 128-bit vector of [4 x i32] containing the lesser values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_min_epi32 (__m128i __V1, __m128i __V2)
-{
-#if (__clang_major__ < 14)
- return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
-#else
- return (__m128i) __builtin_elementwise_min((__v4si) __V1, (__v4si) __V2);
-#endif
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
-/// greater value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [4 x i32].
-/// \param __V2
-/// A 128-bit vector of [4 x i32].
-/// \returns A 128-bit vector of [4 x i32] containing the greater values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_max_epi32 (__m128i __V1, __m128i __V2)
-{
-#if (__clang_major__ < 14)
- return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
-#else
- return (__m128i) __builtin_elementwise_max((__v4si) __V1, (__v4si) __V2);
-#endif
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
-/// value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [4 x u32].
-/// \param __V2
-/// A 128-bit vector of [4 x u32].
-/// \returns A 128-bit vector of [4 x u32] containing the lesser values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_min_epu32 (__m128i __V1, __m128i __V2)
-{
-#if (__clang_major__ < 14)
- return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
-#else
- return (__m128i) __builtin_elementwise_min((__v4su) __V1, (__v4su) __V2);
-#endif
-}
-
-/// Compares the corresponding elements of two 128-bit vectors of
-/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
-/// greater value of the two.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [4 x u32].
-/// \param __V2
-/// A 128-bit vector of [4 x u32].
-/// \returns A 128-bit vector of [4 x u32] containing the greater values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_max_epu32 (__m128i __V1, __m128i __V2)
-{
-#if (__clang_major__ < 14)
- return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2);
-#else
- return (__m128i) __builtin_elementwise_max((__v4su) __V1, (__v4su) __V2);
-#endif
-}
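A small clamping helper built from the signed 32-bit min/max forms above (illustrative only, assumes -msse4.1):

#include <smmintrin.h>

/* Clamp every lane of v into [lo, hi]. */
__m128i clamp_epi32(__m128i v, int lo, int hi)
{
    v = _mm_max_epi32(v, _mm_set1_epi32(lo));
    v = _mm_min_epi32(v, _mm_set1_epi32(hi));
    return v;
}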
-
-/* SSE4 Insertion and Extraction from XMM Register Instructions. */
-/// Takes the first argument \a X and inserts an element from the second
-/// argument \a Y as selected by the third argument \a N. That result then
-/// has elements zeroed out also as selected by the third argument \a N. The
-/// resulting 128-bit vector of [4 x float] is then returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
-///
-/// \param X
-/// A 128-bit vector source operand of [4 x float]. With the exception of
-/// those bits in the result copied from parameter \a Y and zeroed by bits
-/// [3:0] of \a N, all bits from this parameter are copied to the result.
-/// \param Y
-/// A 128-bit vector source operand of [4 x float]. One single-precision
-/// floating-point element from this source, as determined by the immediate
-/// parameter, is copied to the result.
-/// \param N
-/// Specifies which bits from operand \a Y will be copied, which bits in the
-/// result they will be copied to, and which bits in the result will be
-/// cleared. The following assignments are made: \n
-/// Bits [7:6] specify the bits to copy from operand \a Y: \n
-/// 00: Selects bits [31:0] from operand \a Y. \n
-/// 01: Selects bits [63:32] from operand \a Y. \n
-/// 10: Selects bits [95:64] from operand \a Y. \n
-/// 11: Selects bits [127:96] from operand \a Y. \n
-/// Bits [5:4] specify the bits in the result to which the selected bits
-/// from operand \a Y are copied: \n
-/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n
-/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n
-/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n
-/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
-/// Bits[3:0]: If any of these bits are set, the corresponding result
-/// element is cleared.
-/// \returns A 128-bit vector of [4 x float] containing the copied
-/// single-precision floating point elements from the operands.
-#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
-
-/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
-/// returns it, using the immediate value parameter \a N as a selector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_extract_ps(__m128 X, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
-/// instruction.
-///
-/// \param X
-/// A 128-bit vector of [4 x float].
-/// \param N
-/// An immediate value. Bits [1:0] determines which bits from the argument
-/// \a X are extracted and returned: \n
-/// 00: Bits [31:0] of parameter \a X are returned. \n
-/// 01: Bits [63:32] of parameter \a X are returned. \n
-/// 10: Bits [95:64] of parameter \a X are returned. \n
-/// 11: Bits [127:96] of parameter \a X are returned.
-/// \returns A 32-bit integer containing the extracted 32 bits of float data.
-#define _mm_extract_ps(X, N) \
- __builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
-
-/* Miscellaneous insert and extract macros. */
-/* Extract a single-precision float from X at index N into D. */
-#define _MM_EXTRACT_FLOAT(D, X, N) \
- do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0)
-
-/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
- an index suitable for _mm_insert_ps. */
-#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))
-
-/* Extract a float from X at index N into the first index of the return. */
-#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \
- _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
-
-/* Insert int into packed integer array at index. */
-/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
-/// the 128-bit integer vector parameter, and then inserting the lower 8 bits
-/// of an integer parameter \a I into an offset specified by the immediate
-/// value parameter \a N.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
-///
-/// \param X
-/// A 128-bit integer vector of [16 x i8]. This vector is copied to the
-/// result and then one of the sixteen elements in the result vector is
-/// replaced by the lower 8 bits of \a I.
-/// \param I
-/// An integer. The lower 8 bits of this operand are written to the result
-/// beginning at the offset specified by \a N.
-/// \param N
-/// An immediate value. Bits [3:0] specify the bit offset in the result at
-/// which the lower 8 bits of \a I are written. \n
-/// 0000: Bits [7:0] of the result are used for insertion. \n
-/// 0001: Bits [15:8] of the result are used for insertion. \n
-/// 0010: Bits [23:16] of the result are used for insertion. \n
-/// 0011: Bits [31:24] of the result are used for insertion. \n
-/// 0100: Bits [39:32] of the result are used for insertion. \n
-/// 0101: Bits [47:40] of the result are used for insertion. \n
-/// 0110: Bits [55:48] of the result are used for insertion. \n
-/// 0111: Bits [63:56] of the result are used for insertion. \n
-/// 1000: Bits [71:64] of the result are used for insertion. \n
-/// 1001: Bits [79:72] of the result are used for insertion. \n
-/// 1010: Bits [87:80] of the result are used for insertion. \n
-/// 1011: Bits [95:88] of the result are used for insertion. \n
-/// 1100: Bits [103:96] of the result are used for insertion. \n
-/// 1101: Bits [111:104] of the result are used for insertion. \n
-/// 1110: Bits [119:112] of the result are used for insertion. \n
-/// 1111: Bits [127:120] of the result are used for insertion.
-/// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi8(X, I, N) \
- ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
- (int)(I), (int)(N)))
-
-/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
-/// the 128-bit integer vector parameter, and then inserting the 32-bit
-/// integer parameter \a I at the offset specified by the immediate value
-/// parameter \a N.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
-///
-/// \param X
-/// A 128-bit integer vector of [4 x i32]. This vector is copied to the
-/// result and then one of the four elements in the result vector is
-/// replaced by \a I.
-/// \param I
-/// A 32-bit integer that is written to the result beginning at the offset
-/// specified by \a N.
-/// \param N
-/// An immediate value. Bits [1:0] specify the bit offset in the result at
-/// which the integer \a I is written. \n
-/// 00: Bits [31:0] of the result are used for insertion. \n
-/// 01: Bits [63:32] of the result are used for insertion. \n
-/// 10: Bits [95:64] of the result are used for insertion. \n
-/// 11: Bits [127:96] of the result are used for insertion.
-/// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi32(X, I, N) \
- ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
- (int)(I), (int)(N)))
-
-#ifdef __x86_64__
-/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
-/// the 128-bit integer vector parameter, and then inserting the 64-bit
-/// integer parameter \a I, using the immediate value parameter \a N as an
-/// insertion location selector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
-///
-/// \param X
-/// A 128-bit integer vector of [2 x i64]. This vector is copied to the
-/// result and then one of the two elements in the result vector is replaced
-/// by \a I.
-/// \param I
-/// A 64-bit integer that is written to the result beginning at the offset
-/// specified by \a N.
-/// \param N
-/// An immediate value. Bit [0] specifies the bit offset in the result at
-/// which the integer \a I is written. \n
-/// 0: Bits [63:0] of the result are used for insertion. \n
-/// 1: Bits [127:64] of the result are used for insertion. \n
-/// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi64(X, I, N) \
- ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
- (long long)(I), (int)(N)))
-#endif /* __x86_64__ */
-
-/* Extract int from packed integer array at index. This returns the element
- * as a zero extended value, so it is unsigned.
- */
-/// Extracts an 8-bit element from the 128-bit integer vector of
-/// [16 x i8], using the immediate value parameter \a N as a selector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_extract_epi8(__m128i X, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
-///
-/// \param X
-/// A 128-bit integer vector.
-/// \param N
-/// An immediate value. Bits [3:0] specify which 8-bit vector element from
-/// the argument \a X to extract and copy to the result. \n
-/// 0000: Bits [7:0] of parameter \a X are extracted. \n
-/// 0001: Bits [15:8] of the parameter \a X are extracted. \n
-/// 0010: Bits [23:16] of the parameter \a X are extracted. \n
-/// 0011: Bits [31:24] of the parameter \a X are extracted. \n
-/// 0100: Bits [39:32] of the parameter \a X are extracted. \n
-/// 0101: Bits [47:40] of the parameter \a X are extracted. \n
-/// 0110: Bits [55:48] of the parameter \a X are extracted. \n
-/// 0111: Bits [63:56] of the parameter \a X are extracted. \n
-/// 1000: Bits [71:64] of the parameter \a X are extracted. \n
-/// 1001: Bits [79:72] of the parameter \a X are extracted. \n
-/// 1010: Bits [87:80] of the parameter \a X are extracted. \n
-/// 1011: Bits [95:88] of the parameter \a X are extracted. \n
-/// 1100: Bits [103:96] of the parameter \a X are extracted. \n
-/// 1101: Bits [111:104] of the parameter \a X are extracted. \n
-/// 1110: Bits [119:112] of the parameter \a X are extracted. \n
-/// 1111: Bits [127:120] of the parameter \a X are extracted.
-/// \returns An unsigned integer, whose lower 8 bits are selected from the
-/// 128-bit integer vector parameter and the remaining bits are assigned
-/// zeros.
-#define _mm_extract_epi8(X, N) \
- ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
- (int)(N)))
-
-/// Extracts a 32-bit element from the 128-bit integer vector of
-/// [4 x i32], using the immediate value parameter \a N as a selector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_extract_epi32(__m128i X, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
-///
-/// \param X
-/// A 128-bit integer vector.
-/// \param N
-/// An immediate value. Bits [1:0] specify which 32-bit vector element from
-/// the argument \a X to extract and copy to the result. \n
-/// 00: Bits [31:0] of the parameter \a X are extracted. \n
-/// 01: Bits [63:32] of the parameter \a X are extracted. \n
-/// 10: Bits [95:64] of the parameter \a X are extracted. \n
-/// 11: Bits [127:96] of the parameter \a X are extracted.
-/// \returns An integer, whose lower 32 bits are selected from the 128-bit
-/// integer vector parameter and the remaining bits are assigned zeros.
-#define _mm_extract_epi32(X, N) \
- ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
-
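Illustrative lane insert/extract (not part of this patch, assumes -msse4.1); both selectors must be compile-time constants:

#include <smmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i v = _mm_set_epi32(40, 30, 20, 10);  /* lanes {10, 20, 30, 40} */
    v = _mm_insert_epi32(v, 99, 2);             /* lane 2: 30 -> 99 */
    printf("%d %d\n",
           _mm_extract_epi32(v, 2),             /* 99 */
           _mm_extract_epi32(v, 3));            /* 40 */
    return 0;
}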
-#ifdef __x86_64__
-/// Extracts a 64-bit element from the 128-bit integer vector of
-/// [2 x i64], using the immediate value parameter \a N as a selector.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// long long _mm_extract_epi64(__m128i X, const int N);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction.
-///
-/// \param X
-/// A 128-bit integer vector.
-/// \param N
-/// An immediate value. Bit [0] specifies which 64-bit vector element from
-/// the argument \a X to return. \n
-/// 0: Bits [63:0] are returned. \n
-/// 1: Bits [127:64] are returned. \n
-/// \returns A 64-bit integer.
-#define _mm_extract_epi64(X, N) \
- ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
-#endif /* __x86_64 */
-
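/* Illustrative usage sketch (editor addition, not part of the original header):
 * pulling individual lanes out of a vector with the extract macros. The helper
 * name `extract_demo` is hypothetical; compile with -msse4.1. */
#include <smmintrin.h>

static int extract_demo(void)
{
    __m128i v = _mm_setr_epi32(10, 20, 30, 40);
    int third    = _mm_extract_epi32(v, 2);   /* 30 */
    int low_byte = _mm_extract_epi8(v, 0);    /* 10, zero-extended into an int */
    return third + low_byte;                  /* 40 */
}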
-/* SSE4 128-bit Packed Integer Comparisons. */
-/// Tests whether the specified bits in a 128-bit integer vector are all
-/// zeros.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
-///
-/// \param __M
-/// A 128-bit integer vector containing the bits to be tested.
-/// \param __V
-/// A 128-bit integer vector selecting which bits to test in operand \a __M.
-/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_testz_si128(__m128i __M, __m128i __V)
-{
- return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
-}
-
-/// Tests whether the specified bits in a 128-bit integer vector are all
-/// ones.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
-///
-/// \param __M
-/// A 128-bit integer vector containing the bits to be tested.
-/// \param __V
-/// A 128-bit integer vector selecting which bits to test in operand \a __M.
-/// \returns TRUE if the specified bits are all ones; FALSE otherwise.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_testc_si128(__m128i __M, __m128i __V)
-{
- return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
-}
-
-/// Tests whether the specified bits in a 128-bit integer vector are
-/// neither all zeros nor all ones.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
-///
-/// \param __M
-/// A 128-bit integer vector containing the bits to be tested.
-/// \param __V
-/// A 128-bit integer vector selecting which bits to test in operand \a __M.
-/// \returns TRUE if the specified bits are neither all zeros nor all ones;
-/// FALSE otherwise.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_testnzc_si128(__m128i __M, __m128i __V)
-{
- return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
-}
-
-/// Tests whether the specified bits in a 128-bit integer vector are all
-/// ones.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_test_all_ones(__m128i V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
-///
-/// \param V
-/// A 128-bit integer vector containing the bits to be tested.
-/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE
-/// otherwise.
-#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
-
-/// Tests whether the specified bits in a 128-bit integer vector are
-/// neither all zeros nor all ones.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
-///
-/// \param M
-/// A 128-bit integer vector containing the bits to be tested.
-/// \param V
-/// A 128-bit integer vector selecting which bits to test in operand \a M.
-/// \returns TRUE if the specified bits are neither all zeros nor all ones;
-/// FALSE otherwise.
-#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
-
-/// Tests whether the specified bits in a 128-bit integer vector are all
-/// zeros.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_test_all_zeros(__m128i M, __m128i V);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
-///
-/// \param M
-/// A 128-bit integer vector containing the bits to be tested.
-/// \param V
-/// A 128-bit integer vector selecting which bits to test in operand \a M.
-/// \returns TRUE if the specified bits are all zeros; FALSE otherwise.
-#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
-
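/* Illustrative usage sketch (editor addition, not part of the original header):
 * the PTEST-based helpers turn a whole-vector comparison into a scalar branch.
 * The helper name `all_lanes_equal` is hypothetical; compile with -msse4.1. */
#include <smmintrin.h>

static int all_lanes_equal(__m128i a, __m128i b)
{
    __m128i diff = _mm_xor_si128(a, b);     /* zero wherever a and b agree */
    return _mm_test_all_zeros(diff, diff);  /* 1 iff every bit of diff is 0 */
}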
-/* SSE4 64-bit Packed Integer Comparisons. */
-/// Compares each of the corresponding 64-bit values of the 128-bit
-/// integer vectors for equality.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
-///
-/// \param __V1
-/// A 128-bit integer vector.
-/// \param __V2
-/// A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
-{
- return (__m128i)((__v2di)__V1 == (__v2di)__V2);
-}
-
-/* SSE4 Packed Integer Sign-Extension. */
-/// Sign-extends each of the lower eight 8-bit integer elements of a
-/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
-/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
-/// are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
-///
-/// \param __V
-/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign-
-/// extended to 16-bit values.
-/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepi8_epi16(__m128i __V)
-{
- /* This function always performs a signed extension, but __v16qi is a char
- which may be signed or unsigned, so use __v16qs. */
- return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
-}
-
-/// Sign-extends each of the lower four 8-bit integer elements of a
-/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
-/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
-/// vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
-///
-/// \param __V
-/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
-/// sign-extended to 32-bit values.
-/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepi8_epi32(__m128i __V)
-{
- /* This function always performs a signed extension, but __v16qi is a char
- which may be signed or unsigned, so use __v16qs. */
- return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
-}
-
-/// Sign-extends each of the lower two 8-bit integer elements of a
-/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
-/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
-/// vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
-///
-/// \param __V
-/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
-/// sign-extended to 64-bit values.
-/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepi8_epi64(__m128i __V)
-{
- /* This function always performs a signed extension, but __v16qi is a char
- which may be signed or unsigned, so use __v16qs. */
- return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
-}
-
-/// Sign-extends each of the lower four 16-bit integer elements of a
-/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
-/// a 128-bit vector of [4 x i32]. The upper four elements of the input
-/// vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
-///
-/// \param __V
-/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
-/// sign-extended to 32-bit values.
-/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepi16_epi32(__m128i __V)
-{
- return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
-}
-
-/// Sign-extends each of the lower two 16-bit integer elements of a
-/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
-/// a 128-bit vector of [2 x i64]. The upper six elements of the input
-/// vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
-///
-/// \param __V
-/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
-/// sign-extended to 64-bit values.
-/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepi16_epi64(__m128i __V)
-{
- return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
-}
-
-/// Sign-extends each of the lower two 32-bit integer elements of a
-/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
-/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
-/// are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
-///
-/// \param __V
-/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
-/// sign-extended to 64-bit values.
-/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepi32_epi64(__m128i __V)
-{
- return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
-}
-
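/* Illustrative usage sketch (editor addition, not part of the original header):
 * widening the low four 16-bit lanes before multiplying, so the products
 * cannot overflow 32 bits. The helper name `widen_and_square_lo4` is
 * hypothetical; compile with -msse4.1. */
#include <smmintrin.h>

static __m128i widen_and_square_lo4(__m128i v16)    /* v16: [8 x i16] */
{
    __m128i v32 = _mm_cvtepi16_epi32(v16);           /* low 4 lanes -> [4 x i32] */
    return _mm_mullo_epi32(v32, v32);                /* per-lane squares */
}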
-/* SSE4 Packed Integer Zero-Extension. */
-/// Zero-extends each of the lower eight 8-bit integer elements of a
-/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
-/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
-/// are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
-///
-/// \param __V
-/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
-/// zero-extended to 16-bit values.
-/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepu8_epi16(__m128i __V)
-{
- return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
-}
-
-/// Zero-extends each of the lower four 8-bit integer elements of a
-/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
-/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
-/// vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
-///
-/// \param __V
-/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
-/// zero-extended to 32-bit values.
-/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepu8_epi32(__m128i __V)
-{
- return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
-}
-
-/// Zero-extends each of the lower two 8-bit integer elements of a
-/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
-/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
-/// vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
-///
-/// \param __V
-/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
-/// zero-extended to 64-bit values.
-/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepu8_epi64(__m128i __V)
-{
- return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
-}
-
-/// Zero-extends each of the lower four 16-bit integer elements of a
-/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
-/// a 128-bit vector of [4 x i32]. The upper four elements of the input
-/// vector are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
-///
-/// \param __V
-/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
-/// zero-extended to 32-bit values.
-/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepu16_epi32(__m128i __V)
-{
- return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
-}
-
-/// Zero-extends each of the lower two 16-bit integer elements of a
-/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
-/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
-/// are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
-///
-/// \param __V
-/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
-/// zero-extended to 64-bit values.
-/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepu16_epi64(__m128i __V)
-{
- return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
-}
-
-/// Zero-extends each of the lower two 32-bit integer elements of a
-/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
-/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
-/// are unused.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
-///
-/// \param __V
-/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
-/// zero-extended to 64-bit values.
-/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cvtepu32_epi64(__m128i __V)
-{
- return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di);
-}
-
-/* SSE4 Pack with Unsigned Saturation. */
-/// Converts 32-bit signed integers from both 128-bit integer vector
-/// operands into 16-bit unsigned integers, and returns the packed result.
-/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
-/// 0x0000 are saturated to 0x0000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
-///
-/// \param __V1
-/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
-/// signed integer and is converted to a 16-bit unsigned integer with
-/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
-/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
-/// are written to the lower 64 bits of the result.
-/// \param __V2
-/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a
-/// signed integer and is converted to a 16-bit unsigned integer with
-/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values
-/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values
-/// are written to the higher 64 bits of the result.
-/// \returns A 128-bit vector of [8 x i16] containing the converted values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_packus_epi32(__m128i __V1, __m128i __V2)
-{
- return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2);
-}
-
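/* Illustrative usage sketch (editor addition, not part of the original header):
 * clamping 32-bit intermediates into unsigned 16-bit storage with PACKUSDW.
 * The helper name `clamp_to_u16` is hypothetical; compile with -msse4.1. */
#include <smmintrin.h>

static __m128i clamp_to_u16(__m128i lo, __m128i hi)  /* two [4 x i32] inputs */
{
    /* Negative lanes become 0x0000, lanes above 65535 become 0xFFFF. */
    return _mm_packus_epi32(lo, hi);                  /* result: [8 x u16] */
}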
-/* SSE4 Multiple Packed Sums of Absolute Difference. */
-/// Subtracts corresponding 8-bit unsigned integer values in the two source
-/// operands and computes the absolute values of the differences. Sums of
-/// four absolute differences are then returned according to the bit fields
-/// in the immediate operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
-///
-/// \param X
-/// A 128-bit vector of [16 x i8].
-/// \param Y
-/// A 128-bit vector of [16 x i8].
-/// \param M
-/// An 8-bit immediate operand specifying how the absolute differences are to
-/// be calculated, according to the following algorithm:
-/// \code
-/// // M2 represents bit 2 of the immediate operand
-/// // M10 represents bits [1:0] of the immediate operand
-/// i = M2 * 4;
-/// j = M10 * 4;
-/// for (k = 0; k < 8; k = k + 1) {
-/// d0 = abs(X[i + k + 0] - Y[j + 0]);
-/// d1 = abs(X[i + k + 1] - Y[j + 1]);
-/// d2 = abs(X[i + k + 2] - Y[j + 2]);
-/// d3 = abs(X[i + k + 3] - Y[j + 3]);
-/// r[k] = d0 + d1 + d2 + d3;
-/// }
-/// \endcode
-/// \returns A 128-bit integer vector containing the sums of the sets of
-/// absolute differences between both operands.
-#define _mm_mpsadbw_epu8(X, Y, M) \
- ((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
- (__v16qi)(__m128i)(Y), (M)))
-
-/// Finds the minimum unsigned 16-bit element in the input 128-bit
-/// vector of [8 x u16] and returns it along with its index.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c>
-/// instruction.
-///
-/// \param __V
-/// A 128-bit vector of [8 x u16].
-/// \returns A 128-bit value where bits [15:0] contain the minimum value found
-/// in parameter \a __V, bits [18:16] contain the index of the minimum value
-/// and the remaining bits are set to 0.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_minpos_epu16(__m128i __V)
-{
- return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V);
-}
-
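/* Illustrative usage sketch (editor addition, not part of the original header):
 * splitting the PHMINPOSUW result into the minimum value and its lane index.
 * The helper name `min_u16` is hypothetical; compile with -msse4.1. */
#include <smmintrin.h>

static void min_u16(__m128i v, unsigned *value, unsigned *index)
{
    __m128i r = _mm_minpos_epu16(v);
    unsigned packed = (unsigned)_mm_cvtsi128_si32(r);
    *value = packed & 0xFFFFu;        /* bits [15:0]: minimum element  */
    *index = (packed >> 16) & 0x7u;   /* bits [18:16]: its lane index  */
}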
-/* Handle the sse4.2 definitions here. */
-
-/* These definitions are normally in nmmintrin.h, but gcc puts them in here
- so we'll do the same. */
-
-#undef __DEFAULT_FN_ATTRS
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
-
-/* These specify the type of data that we're comparing. */
-#define _SIDD_UBYTE_OPS 0x00
-#define _SIDD_UWORD_OPS 0x01
-#define _SIDD_SBYTE_OPS 0x02
-#define _SIDD_SWORD_OPS 0x03
-
-/* These specify the type of comparison operation. */
-#define _SIDD_CMP_EQUAL_ANY 0x00
-#define _SIDD_CMP_RANGES 0x04
-#define _SIDD_CMP_EQUAL_EACH 0x08
-#define _SIDD_CMP_EQUAL_ORDERED 0x0c
-
-/* These macros specify the polarity of the operation. */
-#define _SIDD_POSITIVE_POLARITY 0x00
-#define _SIDD_NEGATIVE_POLARITY 0x10
-#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
-#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
-
-/* These macros are used in _mm_cmpXstri() to specify the return. */
-#define _SIDD_LEAST_SIGNIFICANT 0x00
-#define _SIDD_MOST_SIGNIFICANT 0x40
-
-/* These macros are used in _mm_cmpXstri() to specify the return. */
-#define _SIDD_BIT_MASK 0x00
-#define _SIDD_UNIT_MASK 0x40
-
-/* SSE4.2 Packed Comparison Intrinsics. */
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with implicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns a 128-bit integer vector representing the result
-/// mask of the comparison.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words, the type of comparison to perform, and the format of the return
-/// value. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B. \n
-/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
-/// bytes. \n
-/// 0: The result is zero-extended to 16 bytes. \n
-/// 1: The result is expanded to 16 bytes (this expansion is performed by
-/// repeating each bit 8 or 16 times).
-/// \returns Returns a 128-bit integer vector representing the result mask of
-/// the comparison.
-#define _mm_cmpistrm(A, B, M) \
- ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with implicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns an integer representing the result index of the
-/// comparison.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words, the type of comparison to perform, and the format of the return
-/// value. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B. \n
-/// Bit [6]: Determines whether the index of the lowest set bit or the
-/// highest set bit is returned. \n
-/// 0: The index of the least significant set bit. \n
-/// 1: The index of the most significant set bit. \n
-/// \returns Returns an integer representing the result index of the comparison.
-#define _mm_cmpistri(A, B, M) \
- ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (int)(M)))
-
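/* Illustrative usage sketch (editor addition, not part of the original header):
 * an implicit-length "equal any" search for the first delimiter in a 16-byte
 * chunk. The helper name `first_delim` is hypothetical; it assumes 16 readable
 * bytes at `chunk` and compilation with -msse4.2. */
#include <nmmintrin.h>

static int first_delim(const char *chunk)
{
    const __m128i delims = _mm_setr_epi8(' ', '\t', '\n', 0, 0, 0, 0, 0,
                                         0, 0, 0, 0, 0, 0, 0, 0);
    __m128i data = _mm_loadu_si128((const __m128i *)chunk);
    /* Returns the index of the first matching byte, or 16 if none is found
       before the end of the (implicitly NUL-terminated) data. */
    return _mm_cmpistri(delims, data,
                        _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                        _SIDD_LEAST_SIGNIFICANT);
}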
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with explicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns a 128-bit integer vector representing the result
-/// mask of the comparison.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LA
-/// An integer that specifies the length of the string in \a A.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LB
-/// An integer that specifies the length of the string in \a B.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words, the type of comparison to perform, and the format of the return
-/// value. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B. \n
-/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
-/// bytes. \n
-/// 0: The result is zero-extended to 16 bytes. \n
-/// 1: The result is expanded to 16 bytes (this expansion is performed by
-/// repeating each bit 8 or 16 times). \n
-/// \returns Returns a 128-bit integer vector representing the result mask of
-/// the comparison.
-#define _mm_cmpestrm(A, LA, B, LB, M) \
- ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
- (__v16qi)(__m128i)(B), (int)(LB), \
- (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with explicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns an integer representing the result index of the
-/// comparison.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LA
-/// An integer that specifies the length of the string in \a A.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LB
-/// An integer that specifies the length of the string in \a B.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words, the type of comparison to perform, and the format of the return
-/// value. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B. \n
-/// Bit [6]: Determines whether the index of the lowest set bit or the
-/// highest set bit is returned. \n
-/// 0: The index of the least significant set bit. \n
-/// 1: The index of the most significant set bit. \n
-/// \returns Returns an integer representing the result index of the comparison.
-#define _mm_cmpestri(A, LA, B, LB, M) \
- ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
- (__v16qi)(__m128i)(B), (int)(LB), \
- (int)(M)))
-
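/* Illustrative usage sketch (editor addition, not part of the original header):
 * the explicit-length form lets the data contain NUL bytes; LA/LB are element
 * counts. The helper name `find_byte` is hypothetical; it assumes buf_len <= 16,
 * 16 readable bytes at `buf`, and compilation with -msse4.2. */
#include <nmmintrin.h>

static int find_byte(const void *buf, int buf_len, char needle)
{
    __m128i set  = _mm_setr_epi8(needle, 0, 0, 0, 0, 0, 0, 0,
                                 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i data = _mm_loadu_si128((const __m128i *)buf);
    /* Index of the first occurrence of `needle`, or 16 if there is none. */
    return _mm_cmpestri(set, 1, data, buf_len,
                        _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                        _SIDD_LEAST_SIGNIFICANT);
}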
-/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with implicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
-/// string in \a B is the maximum, otherwise, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words and the type of comparison to perform. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B. \n
-/// \returns Returns 1 if the bit mask is zero and the length of the string in
-/// \a B is the maximum; otherwise, returns 0.
-#define _mm_cmpistra(A, B, M) \
- ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with implicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
-/// 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words and the type of comparison to perform. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B.
-/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
-#define _mm_cmpistrc(A, B, M) \
- ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with implicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns bit 0 of the resulting bit mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words and the type of comparison to perform. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B. \n
-/// \returns Returns bit 0 of the resulting bit mask.
-#define _mm_cmpistro(A, B, M) \
- ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with implicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
-/// the maximum, otherwise, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words and the type of comparison to perform. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B. \n
-/// \returns Returns 1 if the length of the string in \a A is less than the
-/// maximum, otherwise, returns 0.
-#define _mm_cmpistrs(A, B, M) \
- ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with implicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
-/// the maximum, otherwise, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words and the type of comparison to perform. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B.
-/// \returns Returns 1 if the length of the string in \a B is less than the
-/// maximum, otherwise, returns 0.
-#define _mm_cmpistrz(A, B, M) \
- ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (int)(M)))
-
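/* Illustrative usage sketch (editor addition, not part of the original header):
 * the index and flag-reading forms pair up in a strlen-style scan; the "Z"
 * intrinsic reports that a NUL terminated the 16-byte chunk and the index
 * intrinsic reports where. The helper name `strlen_sse42` is hypothetical; it
 * assumes the buffer may safely be read in 16-byte steps and -msse4.2. */
#include <nmmintrin.h>
#include <stddef.h>

static size_t strlen_sse42(const char *s)
{
    const __m128i zero = _mm_setzero_si128();
    size_t off = 0;
    for (;;) {
        __m128i chunk = _mm_loadu_si128((const __m128i *)(s + off));
        /* EQUAL_EACH against an all-zero (length 0) operand marks the NUL
           terminator and everything after it. */
        if (_mm_cmpistrz(zero, chunk, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH))
            return off + _mm_cmpistri(zero, chunk,
                                      _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
        off += 16;
    }
}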
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with explicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
-/// string in \a B is the maximum, otherwise, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LA
-/// An integer that specifies the length of the string in \a A.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LB
-/// An integer that specifies the length of the string in \a B.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words and the type of comparison to perform. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B.
-/// \returns Returns 1 if the bit mask is zero and the length of the string in
-/// \a B is the maximum, otherwise, returns 0.
-#define _mm_cmpestra(A, LA, B, LB, M) \
- ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
- (__v16qi)(__m128i)(B), (int)(LB), \
- (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with explicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
-/// returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LA
-/// An integer that specifies the length of the string in \a A.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LB
-/// An integer that specifies the length of the string in \a B.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words and the type of comparison to perform. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B. \n
-/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
-#define _mm_cmpestrc(A, LA, B, LB, M) \
- ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
- (__v16qi)(__m128i)(B), (int)(LB), \
- (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with explicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns bit 0 of the resulting bit mask.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LA
-/// An integer that specifies the length of the string in \a A.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LB
-/// An integer that specifies the length of the string in \a B.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words and the type of comparison to perform. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B.
-/// \returns Returns bit 0 of the resulting bit mask.
-#define _mm_cmpestro(A, LA, B, LB, M) \
- ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
- (__v16qi)(__m128i)(B), (int)(LB), \
- (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with explicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
-/// the maximum, otherwise, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LA
-/// An integer that specifies the length of the string in \a A.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LB
-/// An integer that specifies the length of the string in \a B.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words and the type of comparison to perform. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B. \n
-/// \returns Returns 1 if the length of the string in \a A is less than the
-/// maximum, otherwise, returns 0.
-#define _mm_cmpestrs(A, LA, B, LB, M) \
- ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
- (__v16qi)(__m128i)(B), (int)(LB), \
- (int)(M)))
-
-/// Uses the immediate operand \a M to perform a comparison of string
-/// data with explicitly defined lengths that is contained in source operands
-/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
-/// the maximum, otherwise, returns 0.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
-/// instruction.
-///
-/// \param A
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LA
-/// An integer that specifies the length of the string in \a A.
-/// \param B
-/// A 128-bit integer vector containing one of the source operands to be
-/// compared.
-/// \param LB
-/// An integer that specifies the length of the string in \a B.
-/// \param M
-/// An 8-bit immediate operand specifying whether the characters are bytes or
-/// words and the type of comparison to perform. \n
-/// Bits [1:0]: Determine source data format. \n
-/// 00: 16 unsigned bytes \n
-/// 01: 8 unsigned words \n
-/// 10: 16 signed bytes \n
-/// 11: 8 signed words \n
-/// Bits [3:2]: Determine comparison type and aggregation method. \n
-/// 00: Subset: Each character in \a B is compared for equality with all
-/// the characters in \a A. \n
-/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
-/// basis is greater than or equal for even-indexed elements in \a A,
-/// and less than or equal for odd-indexed elements in \a A. \n
-/// 10: Match: Compare each pair of corresponding characters in \a A and
-/// \a B for equality. \n
-/// 11: Substring: Search \a B for substring matches of \a A. \n
-/// Bits [5:4]: Determine whether to perform a one's complement on the bit
-/// mask of the comparison results. \n
-/// 00: No effect. \n
-/// 01: Negate the bit mask. \n
-/// 10: No effect. \n
-/// 11: Negate the bit mask only for bits with an index less than or equal
-/// to the size of \a A or \a B.
-/// \returns Returns 1 if the length of the string in \a B is less than the
-/// maximum, otherwise, returns 0.
-#define _mm_cmpestrz(A, LA, B, LB, M) \
- ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
- (__v16qi)(__m128i)(B), (int)(LB), \
- (int)(M)))
-
-/* SSE4.2 Compare Packed Data -- Greater Than. */
-/// Compares each of the corresponding 64-bit values of the 128-bit
-/// integer vectors to determine if the values in the first operand are
-/// greater than those in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
-///
-/// \param __V1
-/// A 128-bit integer vector.
-/// \param __V2
-/// A 128-bit integer vector.
-/// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
-{
- return (__m128i)((__v2di)__V1 > (__v2di)__V2);
-}
-
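/* Illustrative usage sketch (editor addition, not part of the original header):
 * a per-lane signed 64-bit maximum built from the greater-than comparison,
 * since SSE4.2 has no packed 64-bit max instruction. The helper name
 * `max_epi64` is hypothetical; compile with -msse4.2. */
#include <nmmintrin.h>

static __m128i max_epi64(__m128i a, __m128i b)
{
    __m128i gt = _mm_cmpgt_epi64(a, b);   /* all-ones lanes where a > b */
    return _mm_blendv_epi8(b, a, gt);     /* pick a where gt is set, else b */
}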
-#undef __DEFAULT_FN_ATTRS
-
-#include <popcntintrin.h>
-
-#include <crc32intrin.h>
-
-#endif /* __SMMINTRIN_H */
+++ /dev/null
-/*===---- tbmintrin.h - TBM intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#error "Never use <tbmintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __TBMINTRIN_H
-#define __TBMINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm")))
-
-#define __bextri_u32(a, b) \
- ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \
- (unsigned int)(b)))
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blcfill_u32(unsigned int __a)
-{
- return __a & (__a + 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blci_u32(unsigned int __a)
-{
- return __a | ~(__a + 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blcic_u32(unsigned int __a)
-{
- return ~__a & (__a + 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blcmsk_u32(unsigned int __a)
-{
- return __a ^ (__a + 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blcs_u32(unsigned int __a)
-{
- return __a | (__a + 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsfill_u32(unsigned int __a)
-{
- return __a | (__a - 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__blsic_u32(unsigned int __a)
-{
- return ~__a | (__a - 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__t1mskc_u32(unsigned int __a)
-{
- return ~__a | (__a + 1);
-}
-
-static __inline__ unsigned int __DEFAULT_FN_ATTRS
-__tzmsk_u32(unsigned int __a)
-{
- return ~__a & (__a - 1);
-}
-
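/* Illustrative sketch (editor addition, not part of the original header): the
 * 32-bit TBM helpers are the classic two-step bit tricks; a quick check of
 * three of them on a sample value. The function name `tbm_demo` is
 * hypothetical; compile with -mtbm. */
#include <x86intrin.h>
#include <assert.h>

static void tbm_demo(void)
{
    unsigned int a = 0x58u;                 /* binary 01011000 */
    assert(__blsfill_u32(a) == 0x5Fu);      /* fill below the lowest set bit */
    assert(__tzmsk_u32(a)   == 0x07u);      /* mask of the trailing zeros    */
    assert(__blcmsk_u32(a)  == 0x01u);      /* mask up to lowest clear bit   */
}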
-#ifdef __x86_64__
-#define __bextri_u64(a, b) \
- ((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \
- (unsigned long long)(b)))
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blcfill_u64(unsigned long long __a)
-{
- return __a & (__a + 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blci_u64(unsigned long long __a)
-{
- return __a | ~(__a + 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blcic_u64(unsigned long long __a)
-{
- return ~__a & (__a + 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blcmsk_u64(unsigned long long __a)
-{
- return __a ^ (__a + 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blcs_u64(unsigned long long __a)
-{
- return __a | (__a + 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsfill_u64(unsigned long long __a)
-{
- return __a | (__a - 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__blsic_u64(unsigned long long __a)
-{
- return ~__a | (__a - 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__t1mskc_u64(unsigned long long __a)
-{
- return ~__a | (__a + 1);
-}
-
-static __inline__ unsigned long long __DEFAULT_FN_ATTRS
-__tzmsk_u64(unsigned long long __a)
-{
- return ~__a & (__a - 1);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __TBMINTRIN_H */
+++ /dev/null
-/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __TMMINTRIN_H
-#define __TMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include <pmmintrin.h>
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64)))
-#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64)))
-
-/// Computes the absolute value of each of the packed 8-bit signed
-/// integers in the source operand and stores the 8-bit unsigned integer
-/// results in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PABSB instruction.
-///
-/// \param __a
-/// A 64-bit vector of [8 x i8].
-/// \returns A 64-bit integer vector containing the absolute values of the
-/// elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_abs_pi8(__m64 __a)
-{
- return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
-}
-
-/// Computes the absolute value of each of the packed 8-bit signed
-/// integers in the source operand and stores the 8-bit unsigned integer
-/// results in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPABSB instruction.
-///
-/// \param __a
-/// A 128-bit vector of [16 x i8].
-/// \returns A 128-bit integer vector containing the absolute values of the
-/// elements in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_abs_epi8(__m128i __a)
-{
-#if (__clang_major__ < 14)
- return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
-#else
- return (__m128i)__builtin_elementwise_abs((__v16qs)__a);
-#endif
-}
-
-/// Computes the absolute value of each of the packed 16-bit signed
-/// integers in the source operand and stores the 16-bit unsigned integer
-/// results in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PABSW instruction.
-///
-/// \param __a
-/// A 64-bit vector of [4 x i16].
-/// \returns A 64-bit integer vector containing the absolute values of the
-/// elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_abs_pi16(__m64 __a)
-{
- return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
-}
-
-/// Computes the absolute value of each of the packed 16-bit signed
-/// integers in the source operand and stores the 16-bit unsigned integer
-/// results in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPABSW instruction.
-///
-/// \param __a
-/// A 128-bit vector of [8 x i16].
-/// \returns A 128-bit integer vector containing the absolute values of the
-/// elements in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_abs_epi16(__m128i __a)
-{
-#if (__clang_major__ < 14)
- return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
-#else
- return (__m128i)__builtin_elementwise_abs((__v8hi)__a);
-#endif
-}
-
-/// Computes the absolute value of each of the packed 32-bit signed
-/// integers in the source operand and stores the 32-bit unsigned integer
-/// results in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PABSD instruction.
-///
-/// \param __a
-/// A 64-bit vector of [2 x i32].
-/// \returns A 64-bit integer vector containing the absolute values of the
-/// elements in the operand.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_abs_pi32(__m64 __a)
-{
- return (__m64)__builtin_ia32_pabsd((__v2si)__a);
-}
-
-/// Computes the absolute value of each of the packed 32-bit signed
-/// integers in the source operand and stores the 32-bit unsigned integer
-/// results in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPABSD instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x i32].
-/// \returns A 128-bit integer vector containing the absolute values of the
-/// elements in the operand.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_abs_epi32(__m128i __a)
-{
-#if (__clang_major__ < 14)
- return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
-#else
- return (__m128i)__builtin_elementwise_abs((__v4si)__a);
-#endif
-}
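/*
 * Illustrative usage sketch for the packed absolute-value intrinsics above,
 * assuming an SSSE3-enabled build (e.g. -mssse3). Note the usual caveat that
 * the absolute value of INT32_MIN wraps back to INT32_MIN, since the result
 * is still interpreted as signed.
 */
#include <limits.h>
#include <tmmintrin.h>

static inline __m128i abs_epi32_sketch(void)
{
    __m128i v = _mm_setr_epi32(-5, 7, INT_MIN, 0);
    return _mm_abs_epi32(v); /* {5, 7, INT_MIN, 0} */
}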
-
-/// Concatenates the two 128-bit integer vector operands, and
-/// right-shifts the result by the number of bytes specified in the immediate
-/// operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c PALIGNR instruction.
-///
-/// \param a
-/// A 128-bit vector of [16 x i8] containing one of the source operands.
-/// \param b
-/// A 128-bit vector of [16 x i8] containing one of the source operands.
-/// \param n
-/// An immediate operand specifying how many bytes to right-shift the result.
-/// \returns A 128-bit integer vector containing the concatenated right-shifted
-/// value.
-#define _mm_alignr_epi8(a, b, n) \
- ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
- (__v16qi)(__m128i)(b), (n)))
-
-/// Concatenates the two 64-bit integer vector operands, and right-shifts
-/// the result by the number of bytes specified in the immediate operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
-/// \endcode
-///
-/// This intrinsic corresponds to the \c PALIGNR instruction.
-///
-/// \param a
-/// A 64-bit vector of [8 x i8] containing one of the source operands.
-/// \param b
-/// A 64-bit vector of [8 x i8] containing one of the source operands.
-/// \param n
-/// An immediate operand specifying how many bytes to right-shift the result.
-/// \returns A 64-bit integer vector containing the concatenated right-shifted
-/// value.
-#define _mm_alignr_pi8(a, b, n) \
- ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
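/*
 * Usage sketch for the byte-alignment macro above, assuming SSSE3: with
 * n = 4 the result is bytes [19:4] of the 32-byte concatenation {a:b}
 * (a in the upper half), i.e. the upper 12 bytes of b followed by the
 * lower 4 bytes of a.
 */
#include <tmmintrin.h>

static inline __m128i alignr4_sketch(__m128i a, __m128i b)
{
    return _mm_alignr_epi8(a, b, 4);
}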
-
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-/// 128-bit vectors of [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPHADDW instruction.
-///
-/// \param __a
-/// A 128-bit vector of [8 x i16] containing one of the source operands. The
-/// horizontal sums of the values are stored in the lower bits of the
-/// destination.
-/// \param __b
-/// A 128-bit vector of [8 x i16] containing one of the source operands. The
-/// horizontal sums of the values are stored in the upper bits of the
-/// destination.
-/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
-/// both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadd_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-/// 128-bit vectors of [4 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPHADDD instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x i32] containing one of the source operands. The
-/// horizontal sums of the values are stored in the lower bits of the
-/// destination.
-/// \param __b
-/// A 128-bit vector of [4 x i32] containing one of the source operands. The
-/// horizontal sums of the values are stored in the upper bits of the
-/// destination.
-/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
-/// both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadd_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
-}
-
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-/// 64-bit vectors of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PHADDW instruction.
-///
-/// \param __a
-/// A 64-bit vector of [4 x i16] containing one of the source operands. The
-/// horizontal sums of the values are stored in the lower bits of the
-/// destination.
-/// \param __b
-/// A 64-bit vector of [4 x i16] containing one of the source operands. The
-/// horizontal sums of the values are stored in the upper bits of the
-/// destination.
-/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
-/// operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_hadd_pi16(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-/// 64-bit vectors of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PHADDD instruction.
-///
-/// \param __a
-/// A 64-bit vector of [2 x i32] containing one of the source operands. The
-/// horizontal sums of the values are stored in the lower bits of the
-/// destination.
-/// \param __b
-/// A 64-bit vector of [2 x i32] containing one of the source operands. The
-/// horizontal sums of the values are stored in the upper bits of the
-/// destination.
-/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
-/// operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_hadd_pi32(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
-}
-
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-/// 128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
-/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
-/// 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPHADDSW instruction.
-///
-/// \param __a
-/// A 128-bit vector of [8 x i16] containing one of the source operands. The
-/// horizontal sums of the values are stored in the lower bits of the
-/// destination.
-/// \param __b
-/// A 128-bit vector of [8 x i16] containing one of the source operands. The
-/// horizontal sums of the values are stored in the upper bits of the
-/// destination.
-/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
-/// sums of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hadds_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Horizontally adds the adjacent pairs of values contained in 2 packed
-/// 64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
-/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
-/// 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PHADDSW instruction.
-///
-/// \param __a
-/// A 64-bit vector of [4 x i16] containing one of the source operands. The
-/// horizontal sums of the values are stored in the lower bits of the
-/// destination.
-/// \param __b
-/// A 64-bit vector of [4 x i16] containing one of the source operands. The
-/// horizontal sums of the values are stored in the upper bits of the
-/// destination.
-/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
-/// sums of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_hadds_pi16(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
-}
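/*
 * Worked sketch of the horizontal-add semantics above, assuming SSSE3:
 * result lane i is the sum of source lanes 2i and 2i+1, with the sums from
 * __a filling the lower half of the destination and those from __b the
 * upper half.
 */
#include <tmmintrin.h>

static inline __m128i hadd_sketch(void)
{
    __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    __m128i b = _mm_setr_epi16(10, 20, 30, 40, 50, 60, 70, 80);
    return _mm_hadd_epi16(a, b); /* {3, 7, 11, 15, 30, 70, 110, 150} */
}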
-
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-/// packed 128-bit vectors of [8 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPHSUBW instruction.
-///
-/// \param __a
-/// A 128-bit vector of [8 x i16] containing one of the source operands. The
-/// horizontal differences between the values are stored in the lower bits of
-/// the destination.
-/// \param __b
-/// A 128-bit vector of [8 x i16] containing one of the source operands. The
-/// horizontal differences between the values are stored in the upper bits of
-/// the destination.
-/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
-/// of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsub_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-/// packed 128-bit vectors of [4 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPHSUBD instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x i32] containing one of the source operands. The
-/// horizontal differences between the values are stored in the lower bits of
-/// the destination.
-/// \param __b
-/// A 128-bit vector of [4 x i32] containing one of the source operands. The
-/// horizontal differences between the values are stored in the upper bits of
-/// the destination.
-/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
-/// of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsub_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-/// packed 64-bit vectors of [4 x i16].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PHSUBW instruction.
-///
-/// \param __a
-/// A 64-bit vector of [4 x i16] containing one of the source operands. The
-/// horizontal differences between the values are stored in the lower bits of
-/// the destination.
-/// \param __b
-/// A 64-bit vector of [4 x i16] containing one of the source operands. The
-/// horizontal differences between the values are stored in the upper bits of
-/// the destination.
-/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
-/// of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_hsub_pi16(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-/// packed 64-bit vectors of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PHSUBD instruction.
-///
-/// \param __a
-/// A 64-bit vector of [2 x i32] containing one of the source operands. The
-/// horizontal differences between the values are stored in the lower bits of
-/// the destination.
-/// \param __b
-/// A 64-bit vector of [2 x i32] containing one of the source operands. The
-/// horizontal differences between the values are stored in the upper bits of
-/// the destination.
-/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
-/// of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_hsub_pi32(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-/// packed 128-bit vectors of [8 x i16]. Positive differences greater than
-/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
-/// saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPHSUBSW instruction.
-///
-/// \param __a
-/// A 128-bit vector of [8 x i16] containing one of the source operands. The
-/// horizontal differences between the values are stored in the lower bits of
-/// the destination.
-/// \param __b
-/// A 128-bit vector of [8 x i16] containing one of the source operands. The
-/// horizontal differences between the values are stored in the upper bits of
-/// the destination.
-/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
-/// differences of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsubs_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Horizontally subtracts the adjacent pairs of values contained in 2
-/// packed 64-bit vectors of [4 x i16]. Positive differences greater than
-/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
-/// saturated to 0x8000.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PHSUBSW instruction.
-///
-/// \param __a
-/// A 64-bit vector of [4 x i16] containing one of the source operands. The
-/// horizontal differences between the values are stored in the lower bits of
-/// the destination.
-/// \param __b
-/// A 64-bit vector of [4 x i16] containing one of the source operands. The
-/// horizontal differences between the values are stored in the upper bits of
-/// the destination.
-/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
-/// differences of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_hsubs_pi16(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
-}
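/*
 * Sketch of the saturating behaviour described above, assuming SSSE3:
 * 0x7FFF - (-1) would be 0x8000, which does not fit in a signed 16-bit
 * lane, so _mm_hsubs_epi16 clamps it to 0x7FFF instead of wrapping.
 */
#include <tmmintrin.h>

static inline __m128i hsubs_sketch(void)
{
    __m128i a = _mm_setr_epi16(0x7FFF, -1, 0, 0, 0, 0, 0, 0);
    __m128i b = _mm_setzero_si128();
    return _mm_hsubs_epi16(a, b); /* lane 0 saturates to 0x7FFF */
}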
-
-/// Multiplies corresponding pairs of packed 8-bit unsigned integer
-/// values contained in the first source operand and packed 8-bit signed
-/// integer values contained in the second source operand, adds pairs of
-/// contiguous products with signed saturation, and writes the 16-bit sums to
-/// the corresponding bits in the destination.
-///
-/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
-/// both operands are multiplied, and the sum of both results is written to
-/// bits [15:0] of the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the first source operand.
-/// \param __b
-/// A 128-bit integer vector containing the second source operand.
-/// \returns A 128-bit integer vector containing the sums of products of both
-/// operands: \n
-/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
-/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
-/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
-/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
-/// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
-/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
-/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
-/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maddubs_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
-}
-
-/// Multiplies corresponding pairs of packed 8-bit unsigned integer
-/// values contained in the first source operand and packed 8-bit signed
-/// integer values contained in the second source operand, adds pairs of
-/// contiguous products with signed saturation, and writes the 16-bit sums to
-/// the corresponding bits in the destination.
-///
-/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
-/// both operands are multiplied, and the sum of both results is written to
-/// bits [15:0] of the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PMADDUBSW instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing the first source operand.
-/// \param __b
-/// A 64-bit integer vector containing the second source operand.
-/// \returns A 64-bit integer vector containing the sums of products of both
-/// operands: \n
-/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
-/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
-/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
-/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_maddubs_pi16(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
-}
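/*
 * Worked sketch of the unsigned-by-signed multiply-add above, assuming
 * SSSE3: with bytes {255, 2} in __a (unsigned) and {1, -3} in __b (signed),
 * lane 0 of the result is 255*1 + 2*(-3) = 249; sums outside the signed
 * 16-bit range would be saturated.
 */
#include <tmmintrin.h>

static inline __m128i maddubs_sketch(void)
{
    __m128i a = _mm_setr_epi8((char)255, 2, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0);
    __m128i b = _mm_setr_epi8(1, -3, 0, 0, 0, 0, 0, 0,
                              0, 0, 0, 0, 0, 0, 0, 0);
    return _mm_maddubs_epi16(a, b); /* lane 0 == 249 */
}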
-
-/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
-/// products to the 18 most significant bits by right-shifting, rounds the
-/// truncated value by adding 1, and writes bits [16:1] to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPMULHRSW instruction.
-///
-/// \param __a
-/// A 128-bit vector of [8 x i16] containing one of the source operands.
-/// \param __b
-/// A 128-bit vector of [8 x i16] containing one of the source operands.
-/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
-/// products of both operands.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_mulhrs_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
-/// products to the 18 most significant bits by right-shifting, rounds the
-/// truncated value by adding 1, and writes bits [16:1] to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PMULHRSW instruction.
-///
-/// \param __a
-/// A 64-bit vector of [4 x i16] containing one of the source operands.
-/// \param __b
-/// A 64-bit vector of [4 x i16] containing one of the source operands.
-/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
-/// products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_mulhrs_pi16(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
-}
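/*
 * Worked sketch of the rounding high multiply above, assuming SSSE3. The
 * operation is ((a * b >> 14) + 1) >> 1, which is the usual Q15 fixed-point
 * multiply: 16384 * 16384 (0.5 * 0.5 in Q15) yields 8192 (0.25 in Q15).
 */
#include <tmmintrin.h>

static inline __m128i mulhrs_sketch(void)
{
    __m128i half = _mm_set1_epi16(16384);
    return _mm_mulhrs_epi16(half, half); /* every lane == 8192 */
}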
-
-/// Copies the 8-bit integers from a 128-bit integer vector to the
-/// destination or clears 8-bit values in the destination, as specified by
-/// the second source operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPSHUFB instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the values to be copied.
-/// \param __b
-/// A 128-bit integer vector containing control bytes corresponding to
-/// positions in the destination:
-/// Bit 7: \n
-/// 1: Clear the corresponding byte in the destination. \n
-/// 0: Copy the selected source byte to the corresponding byte in the
-/// destination. \n
-/// Bits [6:4] Reserved. \n
-/// Bits [3:0] select the source byte to be copied.
-/// \returns A 128-bit integer vector containing the copied or cleared values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_shuffle_epi8(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
-}
-
-/// Copies the 8-bit integers from a 64-bit integer vector to the
-/// destination or clears 8-bit values in the destination, as specified by
-/// the second source operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PSHUFB instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing the values to be copied.
-/// \param __b
-/// A 64-bit integer vector containing control bytes corresponding to
-/// positions in the destination:
-/// Bit 7: \n
-/// 1: Clear the corresponding byte in the destination. \n
-/// 0: Copy the selected source byte to the corresponding byte in the
-/// destination. \n
-/// Bits [3:0] select the source byte to be copied.
-/// \returns A 64-bit integer vector containing the copied or cleared values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_shuffle_pi8(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
-}
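/*
 * Sketch of the shuffle-control format described above, assuming SSSE3:
 * bits [3:0] of each control byte select a source byte, and bit 7 zeroes
 * the lane. The mask below reverses the sixteen bytes of the input.
 */
#include <tmmintrin.h>

static inline __m128i reverse_bytes_sketch(__m128i a)
{
    const __m128i rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                      7, 6, 5, 4, 3, 2, 1, 0);
    return _mm_shuffle_epi8(a, rev);
}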
-
-/// For each 8-bit integer in the first source operand, perform one of
-/// the following actions as specified by the second source operand.
-///
-/// If the byte in the second source is negative, calculate the two's
-/// complement of the corresponding byte in the first source, and write that
-/// value to the destination. If the byte in the second source is positive,
-/// copy the corresponding byte from the first source to the destination. If
-/// the byte in the second source is zero, clear the corresponding byte in
-/// the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPSIGNB instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the values to be copied.
-/// \param __b
-/// A 128-bit integer vector containing control bytes corresponding to
-/// positions in the destination.
-/// \returns A 128-bit integer vector containing the resultant values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sign_epi8(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
-}
-
-/// For each 16-bit integer in the first source operand, perform one of
-/// the following actions as specified by the second source operand.
-///
-/// If the word in the second source is negative, calculate the two's
-/// complement of the corresponding word in the first source, and write that
-/// value to the destination. If the word in the second source is positive,
-/// copy the corresponding word from the first source to the destination. If
-/// the word in the second source is zero, clear the corresponding word in
-/// the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPSIGNW instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the values to be copied.
-/// \param __b
-/// A 128-bit integer vector containing control words corresponding to
-/// positions in the destination.
-/// \returns A 128-bit integer vector containing the resultant values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sign_epi16(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
-}
-
-/// For each 32-bit integer in the first source operand, perform one of
-/// the following actions as specified by the second source operand.
-///
-/// If the doubleword in the second source is negative, calculate the two's
-/// complement of the corresponding doubleword in the first source, and write
-/// that value to the destination. If the doubleword in the second source is
-/// positive, copy the corresponding doubleword from the first source to the
-/// destination. If the doubleword in the second source is zero, clear the
-/// corresponding doubleword in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c VPSIGND instruction.
-///
-/// \param __a
-/// A 128-bit integer vector containing the values to be copied.
-/// \param __b
-/// A 128-bit integer vector containing control doublewords corresponding to
-/// positions in the destination.
-/// \returns A 128-bit integer vector containing the resultant values.
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sign_epi32(__m128i __a, __m128i __b)
-{
- return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
-}
-
-/// For each 8-bit integer in the first source operand, perform one of
-/// the following actions as specified by the second source operand.
-///
-/// If the byte in the second source is negative, calculate the two's
-/// complement of the corresponding byte in the first source, and write that
-/// value to the destination. If the byte in the second source is positive,
-/// copy the corresponding byte from the first source to the destination. If
-/// the byte in the second source is zero, clear the corresponding byte in
-/// the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PSIGNB instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing the values to be copied.
-/// \param __b
-/// A 64-bit integer vector containing control bytes corresponding to
-/// positions in the destination.
-/// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_sign_pi8(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
-}
-
-/// For each 16-bit integer in the first source operand, perform one of
-/// the following actions as specified by the second source operand.
-///
-/// If the word in the second source is negative, calculate the two's
-/// complement of the corresponding word in the first source, and write that
-/// value to the destination. If the word in the second source is positive,
-/// copy the corresponding word from the first source to the destination. If
-/// the word in the second source is zero, clear the corresponding word in
-/// the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PSIGNW instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing the values to be copied.
-/// \param __b
-/// A 64-bit integer vector containing control words corresponding to
-/// positions in the destination.
-/// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_sign_pi16(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// For each 32-bit integer in the first source operand, perform one of
-/// the following actions as specified by the second source operand.
-///
-/// If the doubleword in the second source is negative, calculate the two's
-/// complement of the corresponding doubleword in the first source, and
-/// write that value to the destination. If the doubleword in the second
-/// source is positive, copy the corresponding doubleword from the first
-/// source to the destination. If the doubleword in the second source is
-/// zero, clear the corresponding doubleword in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c PSIGND instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing the values to be copied.
-/// \param __b
-/// A 64-bit integer vector containing two control doublewords corresponding
-/// to positions in the destination.
-/// \returns A 64-bit integer vector containing the resultant values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_sign_pi32(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
-}
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
-
-#endif /* __TMMINTRIN_H */
+++ /dev/null
-/*===------------- tsxldtrkintrin.h - tsxldtrk intrinsics ------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <tsxldtrkintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __TSXLDTRKINTRIN_H
-#define __TSXLDTRKINTRIN_H
-
-/* Define the default attributes for the functions in this file */
-#define _DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("tsxldtrk")))
-
-/// Marks the start of a TSX (RTM) suspend load address tracking region. If
-/// this intrinsic is used inside a transactional region, subsequent loads
-/// are not added to the read set of the transaction. If it's used inside a
-/// suspend load address tracking region it will cause transaction abort.
-/// If it's used outside of a transactional region it behaves like a NOP.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c XSUSLDTRK instruction.
-///
-static __inline__ void _DEFAULT_FN_ATTRS
-_xsusldtrk (void)
-{
- __builtin_ia32_xsusldtrk();
-}
-
-/// Marks the end of a TSX (RTM) suspend load address tracking region. If this
-/// intrinsic is used inside a suspend load address tracking region it will
-/// end the suspend region and all following load addresses will be added to
-/// the transaction read set. If it's used inside an active transaction but
-/// not in a suspend region it will cause transaction abort. If it's used
-/// outside of a transactional region it behaves like a NOP.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the \c XRESLDTRK instruction.
-///
-static __inline__ void _DEFAULT_FN_ATTRS
-_xresldtrk (void)
-{
- __builtin_ia32_xresldtrk();
-}
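/*
 * Minimal sketch of how the suspend/resume pair above is intended to be
 * used, assuming both RTM and TSXLDTRK are available: inside an RTM
 * transaction (via _xbegin/_xend from <immintrin.h>), loads issued between
 * _xsusldtrk and _xresldtrk are not added to the transaction's read set.
 */
#include <immintrin.h>

static inline int tsxldtrk_sketch(const int *side_data)
{
    int v = 0;
    if (_xbegin() == _XBEGIN_STARTED) {
        _xsusldtrk();
        v = *side_data; /* this load is not tracked by the transaction */
        _xresldtrk();
        _xend();
    }
    return v;
}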
-
-#undef _DEFAULT_FN_ATTRS
-
-#endif /* __TSXLDTRKINTRIN_H */
+++ /dev/null
-/*===------------------ uintrintrin.h - UINTR intrinsics -------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86GPRINTRIN_H
-#error "Never use <uintrintrin.h> directly; include <x86gprintrin.h> instead."
-#endif
-
-#ifndef __UINTRINTRIN_H
-#define __UINTRINTRIN_H
-
-/* Define the default attributes for the functions in this file */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("uintr")))
-
-#ifdef __x86_64__
-
-struct __uintr_frame
-{
- unsigned long long rip;
- unsigned long long rflags;
- unsigned long long rsp;
-};
-
-/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a
-/// user interrupt cannot be delivered on the instruction boundary following
-/// CLUI. Can be executed only if CR4.UINT = 1, the logical processor is in
-/// 64-bit mode, and software is not executing inside an enclave; otherwise,
-/// it causes an invalid-opcode exception. Causes a transactional abort if
-/// executed inside a transactional region; the abort loads EAX as it would
-/// have been loaded had the abort been due to an execution of CLI.
-///
-/// \headerfile <x86gprintrin.h>
-///
-/// This intrinsic corresponds to the <c> CLUI </c> instruction.
-///
-/// \operation
-/// UIF := 0
-/// \endoperation
-static __inline__ void __DEFAULT_FN_ATTRS
-_clui (void)
-{
- __builtin_ia32_clui();
-}
-
-/// Sets the user interrupt flag (UIF). Its effect takes place immediately; a
-/// user interrupt may be delivered on the instruction boundary following
-/// STUI. Can be executed only if CR4.UINT = 1, the logical processor is in
-/// 64-bit mode, and software is not executing inside an enclave; otherwise,
-/// it causes an invalid-opcode exception. Causes a transactional abort if
-/// executed inside a transactional region; the abort loads EAX as it would
-/// have been loaded had the abort been due to an execution of STI.
-///
-/// \headerfile <x86gprintrin.h>
-///
-/// This intrinsic corresponds to the <c> STUI </c> instruction.
-///
-/// \operation
-/// UIF := 1
-/// \endoperation
-static __inline__ void __DEFAULT_FN_ATTRS
-_stui (void)
-{
- __builtin_ia32_stui();
-}
-
-/// Gets the current value of the user interrupt flag (UIF). Can be executed
-/// regardless of CPL and inside a transactional region. Can be executed only
-/// if CR4.UINT = 1, the logical processor is in 64-bit mode, and software is
-/// not executing inside an enclave; otherwise, it causes an invalid-opcode
-/// exception.
-///
-/// \headerfile <x86gprintrin.h>
-///
-/// This intrinsic corresponds to the <c> TESTUI </c> instruction.
-///
-/// \returns The current value of the user interrupt flag (UIF).
-///
-/// \operation
-/// CF := UIF
-/// ZF := 0
-/// AF := 0
-/// OF := 0
-/// PF := 0
-/// SF := 0
-/// dst := CF
-/// \endoperation
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_testui (void)
-{
- return __builtin_ia32_testui();
-}
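/*
 * Hypothetical sketch, assuming a 64-bit target with user interrupts
 * enabled by the OS (-muintr): temporarily suppress user-interrupt delivery
 * around a critical section, restoring the previous UIF value afterwards.
 */
#include <x86gprintrin.h>

static inline void uintr_critical_section_sketch(void (*body)(void))
{
    unsigned char was_enabled = _testui();
    _clui();            /* UIF := 0, delivery suppressed */
    body();
    if (was_enabled)
        _stui();        /* UIF := 1, delivery re-enabled */
}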
-
-/// Sends an interprocessor user interrupt. Can be executed only if
-/// CR4.UINT = IA32_UINT_TT[0] = 1, the logical processor is in 64-bit mode,
-/// and software is not executing inside an enclave; otherwise, it causes an
-/// invalid-opcode exception. May be executed at any privilege level; all of
-/// its memory accesses are performed with supervisor privilege.
-///
-/// \headerfile <x86gprintrin.h>
-///
-/// This intrinsic corresponds to the <c> SENDUIPI </c> instruction.
-///
-/// \param __a
-/// Index of user-interrupt target table entry in user-interrupt target
-/// table.
-///
-/// \operation
-/// IF __a > UITTSZ
-/// GP (0)
-/// FI
-/// tempUITTE := MEM[UITTADDR + (a<<4)]
-/// // tempUITTE must be valid, and can't have any reserved bit set
-/// IF (tempUITTE.V == 0 OR tempUITTE[7:1] != 0)
-/// GP (0)
-/// FI
-/// tempUPID := MEM[tempUITTE.UPIDADDR] // under lock
-/// // tempUPID can't have any reserved bit set
-/// IF (tempUPID[15:2] != 0 OR tempUPID[31:24] != 0)
-/// GP (0) // release lock
-/// FI
-/// tempUPID.PIR[tempUITTE.UV] := 1;
-/// IF (tempUPID.SN == 0 AND tempUPID.ON == 0)
-/// tempUPID.ON := 1
-/// sendNotify := 1
-/// ELSE
-/// sendNotify := 0
-/// FI
-/// MEM[tempUITTE.UPIDADDR] := tempUPID // release lock
-/// IF sendNotify == 1
-/// IF IA32_APIC_BASE[10] == 1 // local APIC is in x2APIC mode
-/// // send ordinary IPI with vector tempUPID.NV to 32-bit physical APIC
-/// // ID tempUPID.NDST
-/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST)
-/// ELSE
-/// // send ordinary IPI with vector tempUPID.NV to 8-bit physical APIC
-/// // ID tempUPID.NDST[15:8]
-/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8])
-/// FI
-/// FI
-/// \endoperation
-static __inline__ void __DEFAULT_FN_ATTRS
-_senduipi (unsigned long long __a)
-{
- __builtin_ia32_senduipi(__a);
-}
-
-#endif /* __x86_64__ */
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __UINTRINTRIN_H */
+++ /dev/null
-/*===------------------ vaesintrin.h - VAES intrinsics ---------------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <vaesintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __VAESINTRIN_H
-#define __VAESINTRIN_H
-
-/* Default attributes for YMM forms. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"), __min_vector_width__(256)))
-
-/* Default attributes for ZMM forms. */
-#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes"), __min_vector_width__(512)))
-
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
- _mm256_aesenc_epi128(__m256i __A, __m256i __B)
-{
- return (__m256i) __builtin_ia32_aesenc256((__v4di) __A,
- (__v4di) __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
- _mm256_aesdec_epi128(__m256i __A, __m256i __B)
-{
- return (__m256i) __builtin_ia32_aesdec256((__v4di) __A,
- (__v4di) __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
- _mm256_aesenclast_epi128(__m256i __A, __m256i __B)
-{
- return (__m256i) __builtin_ia32_aesenclast256((__v4di) __A,
- (__v4di) __B);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS
- _mm256_aesdeclast_epi128(__m256i __A, __m256i __B)
-{
- return (__m256i) __builtin_ia32_aesdeclast256((__v4di) __A,
- (__v4di) __B);
-}
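/*
 * Sketch of the widened forms above, assuming VAES and AVX2 are available:
 * one AES encryption round applied to two independent 128-bit blocks packed
 * into a __m256i, with the same round key broadcast to both lanes.
 */
#include <immintrin.h>

static inline __m256i vaes_round_sketch(__m256i two_blocks, __m128i round_key)
{
    __m256i rk = _mm256_broadcastsi128_si256(round_key);
    return _mm256_aesenc_epi128(two_blocks, rk);
}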
-
-#ifdef __AVX512FINTRIN_H
-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
- _mm512_aesenc_epi128(__m512i __A, __m512i __B)
-{
- return (__m512i) __builtin_ia32_aesenc512((__v8di) __A,
- (__v8di) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
- _mm512_aesdec_epi128(__m512i __A, __m512i __B)
-{
- return (__m512i) __builtin_ia32_aesdec512((__v8di) __A,
- (__v8di) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
- _mm512_aesenclast_epi128(__m512i __A, __m512i __B)
-{
- return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A,
- (__v8di) __B);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS_F
- _mm512_aesdeclast_epi128(__m512i __A, __m512i __B)
-{
- return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A,
- (__v8di) __B);
-}
-#endif // __AVX512FINTRIN_H
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_F
-
-#endif // __VAESINTRIN_H
+++ /dev/null
-/*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------===
- *
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#ifndef __IMMINTRIN_H
-#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __VPCLMULQDQINTRIN_H
-#define __VPCLMULQDQINTRIN_H
-
-#define _mm256_clmulepi64_epi128(A, B, I) \
- ((__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \
- (__v4di)(__m256i)(B), \
- (char)(I)))
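/*
 * Sketch of the immediate encoding for the carry-less multiply above,
 * assuming VPCLMULQDQ and AVX: bit 0 of I selects the low (0) or high (1)
 * 64-bit half within each 128-bit lane of A, and bit 4 does the same for B,
 * so I = 0x11 multiplies the two high halves of each lane.
 */
#include <immintrin.h>

static inline __m256i clmul_high_sketch(__m256i a, __m256i b)
{
    return _mm256_clmulepi64_epi128(a, b, 0x11);
}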
-
-#ifdef __AVX512FINTRIN_H
-#define _mm512_clmulepi64_epi128(A, B, I) \
- ((__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \
- (__v8di)(__m512i)(B), \
- (char)(I)))
-#endif // __AVX512FINTRIN_H
-
-#endif /* __VPCLMULQDQINTRIN_H */
-
+++ /dev/null
-/*===----------------------- waitpkgintrin.h - WAITPKG --------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <waitpkgintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __WAITPKGINTRIN_H
-#define __WAITPKGINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS \
- __attribute__((__always_inline__, __nodebug__, __target__("waitpkg")))
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_umonitor (void * __address)
-{
- __builtin_ia32_umonitor (__address);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_umwait (unsigned int __control, unsigned long long __counter)
-{
- return __builtin_ia32_umwait (__control,
- (unsigned int)(__counter >> 32), (unsigned int)__counter);
-}
-
-static __inline__ unsigned char __DEFAULT_FN_ATTRS
-_tpause (unsigned int __control, unsigned long long __counter)
-{
- return __builtin_ia32_tpause (__control,
- (unsigned int)(__counter >> 32), (unsigned int)__counter);
-}
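/*
 * Usage sketch for _tpause above, assuming WAITPKG is available: __control
 * selects the wait state (0 = C0.2, deeper; 1 = C0.1, faster wakeup) and
 * __counter is an absolute TSC deadline, here derived from __rdtsc(). The
 * return value is the carry flag reported by the instruction.
 */
#include <immintrin.h>

static inline unsigned char tpause_for_cycles_sketch(unsigned long long cycles)
{
    unsigned long long deadline = __rdtsc() + cycles;
    return _tpause(0, deadline);
}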
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif /* __WAITPKGINTRIN_H */
+++ /dev/null
-/*===-------------- wbnoinvdintrin.h - wbnoinvd intrinsic ------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H
-#error "Never use <wbnoinvdintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __WBNOINVDINTRIN_H
-#define __WBNOINVDINTRIN_H
-
-static __inline__ void
- __attribute__((__always_inline__, __nodebug__, __target__("wbnoinvd")))
-_wbnoinvd (void)
-{
- __builtin_ia32_wbnoinvd ();
-}
-
-#endif /* __WBNOINVDINTRIN_H */
+++ /dev/null
-/*===---- wmmintrin.h - AES intrinsics ------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __WMMINTRIN_H
-#define __WMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include <emmintrin.h>
-
-#include <__wmmintrin_aes.h>
-
-#include <__wmmintrin_pclmul.h>
-
-#endif /* __WMMINTRIN_H */
+++ /dev/null
-/*===--------------- x86gprintrin.h - X86 GPR intrinsics ------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86GPRINTRIN_H
-#define __X86GPRINTRIN_H
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__HRESET__)
-#include <hresetintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__UINTR__)
-#include <uintrintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__CRC32__)
-#include <crc32intrin.h>
-#endif
-
-#define __SSC_MARK(Tag) \
- __asm__ __volatile__("mov {%%ebx, %%eax|eax, ebx}; " \
- "mov {%0, %%ebx|ebx, %0}; " \
- ".byte 0x64, 0x67, 0x90; " \
- "mov {%%eax, %%ebx|ebx, eax};" ::"i"(Tag) \
- : "%eax");
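/*
 * Hypothetical usage sketch for __SSC_MARK above: the ebx load plus the
 * 0x64 0x67 0x90 byte sequence is effectively a NOP on real hardware but is
 * recognized as a marker by simulation/analysis tools such as Intel SDE, so
 * a pair of tags can delimit a region of interest. The tag values below are
 * arbitrary examples.
 */
#include <x86gprintrin.h>

static void marked_region_sketch(void (*work)(void))
{
    __SSC_MARK(0x111); /* region start */
    work();
    __SSC_MARK(0x222); /* region end */
}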
-
-#endif /* __X86GPRINTRIN_H */
+++ /dev/null
-/*===---- x86intrin.h - X86 intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#define __X86INTRIN_H
-
-#include <ia32intrin.h>
-
-#include <immintrin.h>
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__3dNOW__)
-#include <mm3dnow.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__PRFCHW__)
-#include <prfchwintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__SSE4A__)
-#include <ammintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__FMA4__)
-#include <fma4intrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__XOP__)
-#include <xopintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__TBM__)
-#include <tbmintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__LWP__)
-#include <lwpintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__MWAITX__)
-#include <mwaitxintrin.h>
-#endif
-
-#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
- defined(__CLZERO__)
-#include <clzerointrin.h>
-#endif
-
-
-#endif /* __X86INTRIN_H */
+++ /dev/null
-/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __XMMINTRIN_H
-#define __XMMINTRIN_H
-
-#if !defined(__i386__) && !defined(__x86_64__)
-#error "This header is only meant to be used on x86 and x64 architecture"
-#endif
-
-#include <mmintrin.h>
-
-typedef int __v4si __attribute__((__vector_size__(16)));
-typedef float __v4sf __attribute__((__vector_size__(16)));
-typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
-
-typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
-
-/* Unsigned types */
-typedef unsigned int __v4su __attribute__((__vector_size__(16)));
-
-/* This header should only be included in a hosted environment as it depends on
- * a standard library to provide allocation routines. */
-#if __STDC_HOSTED__
-#include <mm_malloc.h>
-#endif
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
-
-/// Adds the 32-bit float values in the low-order bits of the operands.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// The lower 32 bits of this operand are used in the calculation.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// The lower 32 bits of this operand are used in the calculation.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
-/// of the lower 32 bits of both operands. The upper 96 bits are copied from
-/// the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_add_ss(__m128 __a, __m128 __b)
-{
- __a[0] += __b[0];
- return __a;
-}
-
-/// Adds two 128-bit vectors of [4 x float], and returns the results of
-/// the addition.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// \returns A 128-bit vector of [4 x float] containing the sums of both
-/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_add_ps(__m128 __a, __m128 __b)
-{
- return (__m128)((__v4sf)__a + (__v4sf)__b);
-}
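/*
 * Sketch contrasting the scalar and packed forms above, assuming SSE:
 * _mm_add_ss only adds lane 0 and passes lanes 1..3 of the first operand
 * through, while _mm_add_ps adds all four lanes.
 */
#include <xmmintrin.h>

static inline void add_forms_sketch(void)
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 s = _mm_add_ss(a, b); /* {11, 2, 3, 4} */
    __m128 p = _mm_add_ps(a, b); /* {11, 22, 33, 44} */
    (void)s;
    (void)p;
}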
-
-/// Subtracts the 32-bit float value in the low-order bits of the second
-/// operand from the corresponding value in the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
-/// of this operand are used in the calculation.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
-/// bits of this operand are used in the calculation.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
-/// difference of the lower 32 bits of both operands. The upper 96 bits are
-/// copied from the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sub_ss(__m128 __a, __m128 __b)
-{
- __a[0] -= __b[0];
- return __a;
-}
-
-/// Subtracts each of the values of the second operand from the first
-/// operand, both of which are 128-bit vectors of [4 x float] and returns
-/// the results of the subtraction.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the minuend.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing the subtrahend.
-/// \returns A 128-bit vector of [4 x float] containing the differences between
-/// both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sub_ps(__m128 __a, __m128 __b)
-{
- return (__m128)((__v4sf)__a - (__v4sf)__b);
-}
-
-/// Multiplies two 32-bit float values in the low-order bits of the
-/// operands.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// The lower 32 bits of this operand are used in the calculation.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// The lower 32 bits of this operand are used in the calculation.
-/// \returns A 128-bit vector of [4 x float] containing the product of the lower
-/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
-/// bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mul_ss(__m128 __a, __m128 __b)
-{
- __a[0] *= __b[0];
- return __a;
-}
-
-/// Multiplies two 128-bit vectors of [4 x float] and returns the
-/// results of the multiplication.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// \returns A 128-bit vector of [4 x float] containing the products of both
-/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_mul_ps(__m128 __a, __m128 __b)
-{
- return (__m128)((__v4sf)__a * (__v4sf)__b);
-}
-
-/// Divides the value in the low-order 32 bits of the first operand by
-/// the corresponding value in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
-/// bits of this operand are used in the calculation.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
-/// of this operand are used in the calculation.
-/// \returns A 128-bit vector of [4 x float] containing the quotients of the
-/// lower 32 bits of both operands. The upper 96 bits are copied from the
-/// upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_div_ss(__m128 __a, __m128 __b)
-{
- __a[0] /= __b[0];
- return __a;
-}
-
-/// Divides two 128-bit vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the dividend.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing the divisor.
-/// \returns A 128-bit vector of [4 x float] containing the quotients of both
-/// operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_div_ps(__m128 __a, __m128 __b)
-{
- return (__m128)((__v4sf)__a / (__v4sf)__b);
-}
-
-/// Calculates the square root of the value stored in the low-order bits
-/// of a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the calculation.
-/// \returns A 128-bit vector of [4 x float] containing the square root of the
-/// value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ss(__m128 __a)
-{
- return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
-}
-
-/// Calculates the square roots of the values stored in a 128-bit vector
-/// of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the square roots of the
-/// values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_sqrt_ps(__m128 __a)
-{
- return __builtin_ia32_sqrtps((__v4sf)__a);
-}
-
-/// Calculates the approximate reciprocal of the value stored in the
-/// low-order bits of a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the calculation.
-/// \returns A 128-bit vector of [4 x float] containing the approximate
-/// reciprocal of the value in the low-order bits of the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_rcp_ss(__m128 __a)
-{
- return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
-}
-
-/// Calculates the approximate reciprocals of the values stored in a
-/// 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the approximate
-/// reciprocals of the values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_rcp_ps(__m128 __a)
-{
- return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
-}
-
-/// Calculates the approximate reciprocal of the square root of the value
-/// stored in the low-order bits of a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the calculation.
-/// \returns A 128-bit vector of [4 x float] containing the approximate
-/// reciprocal of the square root of the value in the low-order bits of the
-/// operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_rsqrt_ss(__m128 __a)
-{
- return __builtin_ia32_rsqrtss((__v4sf)__a);
-}
-
-/// Calculates the approximate reciprocals of the square roots of the
-/// values stored in a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the approximate
-/// reciprocals of the square roots of the values in the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_rsqrt_ps(__m128 __a)
-{
- return __builtin_ia32_rsqrtps((__v4sf)__a);
-}
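/*
 * Sketch of the usual refinement applied to the ~12-bit estimate above,
 * assuming SSE: one Newton-Raphson step, y' = y * (1.5 - 0.5 * x * y * y),
 * brings the approximate reciprocal square root close to single precision.
 */
#include <xmmintrin.h>

static inline __m128 rsqrt_refined_sketch(__m128 x)
{
    __m128 y = _mm_rsqrt_ps(x);
    __m128 half_x_yy = _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(0.5f), x),
                                  _mm_mul_ps(y, y));
    return _mm_mul_ps(y, _mm_sub_ps(_mm_set1_ps(1.5f), half_x_yy));
}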
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands and returns the lesser value in the low-order bits of the
-/// vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
-/// minimum value between both operands. The upper 96 bits are copied from
-/// the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_min_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 128-bit vectors of [4 x float] and returns the lesser
-/// of each pair of values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands.
-/// \returns A 128-bit vector of [4 x float] containing the minimum values
-/// between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_min_ps(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands and returns the greater value in the low-order bits of a 128-bit
-/// vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
-/// maximum value between both operands. The upper 96 bits are copied from
-/// the upper 96 bits of the first source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_max_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 128-bit vectors of [4 x float] and returns the greater
-/// of each pair of values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands.
-/// \returns A 128-bit vector of [4 x float] containing the maximum values
-/// between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_max_ps(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
-}
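-
-/* Illustrative sketch, not part of this header's API: _mm_min_ps and
- * _mm_max_ps compose into a branch-free per-lane clamp.  The helper name
- * __example_clamp_ps is hypothetical. */
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-__example_clamp_ps(__m128 __v, __m128 __lo, __m128 __hi)
-{
- /* max(lo, min(v, hi)) keeps every lane inside [lo, hi]. */
- return _mm_max_ps(__lo, _mm_min_ps(__v, __hi));
-}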
-
-/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector containing one of the source operands.
-/// \param __b
-/// A 128-bit vector containing one of the source operands.
-/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
-/// values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_and_ps(__m128 __a, __m128 __b)
-{
- return (__m128)((__v4su)__a & (__v4su)__b);
-}
-
-/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
-/// the one's complement of the values contained in the first source
-/// operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the first source operand. The
-/// one's complement of this value is used in the bitwise AND.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing the second source operand.
-/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
-/// one's complement of the first operand and the values in the second
-/// operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_andnot_ps(__m128 __a, __m128 __b)
-{
- return (__m128)(~(__v4su)__a & (__v4su)__b);
-}
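-
-/* Illustrative sketch, not part of this header's API: because -0.0f has only
- * the sign bit set, _mm_andnot_ps can clear the sign bit in every lane,
- * giving a per-lane absolute value.  The helper name __example_abs_ps is
- * hypothetical. */
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-__example_abs_ps(__m128 __x)
-{
- const __m128 __signmask = __extension__ (__m128){ -0.0f, -0.0f, -0.0f, -0.0f };
- /* ~signmask & x clears bit 31 of each lane. */
- return _mm_andnot_ps(__signmask, __x);
-}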
-
-/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
-/// values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_or_ps(__m128 __a, __m128 __b)
-{
- return (__m128)((__v4su)__a | (__v4su)__b);
-}
-
-/// Performs a bitwise exclusive OR of two 128-bit vectors of
-/// [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the source operands.
-/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
-/// of the values between both operands.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_xor_ps(__m128 __a, __m128 __b)
-{
- return (__m128)((__v4su)__a ^ (__v4su)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands for equality and returns the result of the comparison in the
-/// low-order bits of a vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-/// in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpeq_ss(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-/// 128-bit vectors of [4 x float] for equality.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpeq_ps(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the value in the first operand is less than the
-/// corresponding value in the second operand and returns the result of the
-/// comparison in the low-order bits of a vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-/// in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmplt_ss(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-/// 128-bit vectors of [4 x float] to determine if the values in the first
-/// operand are less than those in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmplt_ps(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
-}
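-
-/* Illustrative sketch, not part of this header's API: the packed comparisons
- * return all-ones or all-zeros per lane, so they combine with the logical
- * intrinsics into a branch-free select.  The helper name
- * __example_select_lt_ps is hypothetical. */
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-__example_select_lt_ps(__m128 __x, __m128 __limit, __m128 __a, __m128 __b)
-{
- /* Per lane: (__x < __limit) ? __a : __b. */
- __m128 __mask = _mm_cmplt_ps(__x, __limit);
- return _mm_or_ps(_mm_and_ps(__mask, __a), _mm_andnot_ps(__mask, __b));
-}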
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the value in the first operand is less than or
-/// equal to the corresponding value in the second operand and returns the
-/// result of the comparison in the low-order bits of a vector of
-/// [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-/// in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmple_ss(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-/// 128-bit vectors of [4 x float] to determine if the values in the first
-/// operand are less than or equal to those in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmple_ps(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the value in the first operand is greater than
-/// the corresponding value in the second operand and returns the result of
-/// the comparison in the low-order bits of a vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-/// in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpgt_ss(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_shufflevector((__v4sf)__a,
- (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
- 4, 1, 2, 3);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-/// 128-bit vectors of [4 x float] to determine if the values in the first
-/// operand are greater than those in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpgt_ps(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the value in the first operand is greater than
-/// or equal to the corresponding value in the second operand and returns
-/// the result of the comparison in the low-order bits of a vector of
-/// [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-/// in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpge_ss(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_shufflevector((__v4sf)__a,
- (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
- 4, 1, 2, 3);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-/// 128-bit vectors of [4 x float] to determine if the values in the first
-/// operand are greater than or equal to those in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpge_ps(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands for inequality and returns the result of the comparison in the
-/// low-order bits of a vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-/// in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpneq_ss(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-/// 128-bit vectors of [4 x float] for inequality.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpneq_ps(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the value in the first operand is not less than
-/// the corresponding value in the second operand and returns the result of
-/// the comparison in the low-order bits of a vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-/// in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpnlt_ss(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-/// 128-bit vectors of [4 x float] to determine if the values in the first
-/// operand are not less than those in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpnlt_ps(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the value in the first operand is not less than
-/// or equal to the corresponding value in the second operand and returns
-/// the result of the comparison in the low-order bits of a vector of
-/// [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-/// in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpnle_ss(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-/// 128-bit vectors of [4 x float] to determine if the values in the first
-/// operand are not less than or equal to those in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpnle_ps(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the value in the first operand is not greater
-/// than the corresponding value in the second operand and returns the
-/// result of the comparison in the low-order bits of a vector of
-/// [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-/// in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpngt_ss(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_shufflevector((__v4sf)__a,
- (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
- 4, 1, 2, 3);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-/// 128-bit vectors of [4 x float] to determine if the values in the first
-/// operand are not greater than those in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpngt_ps(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the value in the first operand is not greater
-/// than or equal to the corresponding value in the second operand and
-/// returns the result of the comparison in the low-order bits of a vector
-/// of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-/// in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpnge_ss(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_shufflevector((__v4sf)__a,
- (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
- 4, 1, 2, 3);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-/// 128-bit vectors of [4 x float] to determine if the values in the first
-/// operand are not greater than or equal to those in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpnge_ps(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the value in the first operand is ordered with
-/// respect to the corresponding value in the second operand and returns the
-/// result of the comparison in the low-order bits of a vector of
-/// [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-/// in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpord_ss(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-/// 128-bit vectors of [4 x float] to determine if the values in the first
-/// operand are ordered with respect to those in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpord_ps(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the value in the first operand is unordered
-/// with respect to the corresponding value in the second operand and
-/// returns the result of the comparison in the low-order bits of a vector
-/// of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float] containing one of the operands. The lower
-/// 32 bits of this operand are used in the comparison.
-/// \returns A 128-bit vector of [4 x float] containing the comparison results
-/// in the low-order bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpunord_ss(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares each of the corresponding 32-bit float values of the
-/// 128-bit vectors of [4 x float] to determine if the values in the first
-/// operand are unordered with respect to those in the second operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// \returns A 128-bit vector of [4 x float] containing the comparison results.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cmpunord_ps(__m128 __a, __m128 __b)
-{
- return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
-}
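-
-/* Illustrative sketch, not part of this header's API: comparing a vector
- * with itself using _mm_cmpord_ps yields an all-ones mask only in lanes that
- * are not NaN, which makes a compact NaN scrub.  The helper name
- * __example_zero_nan_lanes_ps is hypothetical. */
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-__example_zero_nan_lanes_ps(__m128 __x)
-{
- /* NaN lanes compare unordered with themselves, so the mask is 0 there and
-  * the AND replaces them with +0.0f. */
- return _mm_and_ps(_mm_cmpord_ps(__x, __x), __x);
-}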
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands for equality and returns the result of the comparison.
-///
-/// If either of the two lower 32-bit values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \returns An integer containing the comparison results. If either of the
-/// two lower 32-bit values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comieq_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the first operand is less than the second
-/// operand and returns the result of the comparison.
-///
-/// If either of the two lower 32-bit values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower 32-bit values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comilt_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the first operand is less than or equal to the
-/// second operand and returns the result of the comparison.
-///
-/// If either of the two lower 32-bit values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower 32-bit values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comile_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the first operand is greater than the second
-/// operand and returns the result of the comparison.
-///
-/// If either of the two lower 32-bit values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \returns An integer containing the comparison results. If either of the
-/// two lower 32-bit values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comigt_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the first operand is greater than or equal to
-/// the second operand and returns the result of the comparison.
-///
-/// If either of the two lower 32-bit values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower 32-bit values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comige_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Compares two 32-bit float values in the low-order bits of both
-/// operands to determine if the first operand is not equal to the second
-/// operand and returns the result of the comparison.
-///
-/// If either of the two lower 32-bit values is NaN, 1 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \returns An integer containing the comparison results. If either of the
-/// two lower 32-bit values is NaN, 1 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_comineq_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
-}
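-
-/* Illustrative sketch, not part of this header's API: the _mm_comi*_ss
- * family returns a plain int and is typically used to drive scalar control
- * flow.  The helper name __example_smaller_low_lane is hypothetical; note
- * that a NaN in either low lane makes _mm_comilt_ss return 0, so __b's low
- * lane is chosen in that case. */
-static __inline__ float __DEFAULT_FN_ATTRS
-__example_smaller_low_lane(__m128 __a, __m128 __b)
-{
- return _mm_comilt_ss(__a, __b) ? __a[0] : __b[0];
-}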
-
-/// Performs an unordered comparison of two 32-bit float values using
-/// the low-order bits of both operands to determine equality and returns
-/// the result of the comparison.
-///
-/// If either of the two lower 32-bit values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower 32-bit values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomieq_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Performs an unordered comparison of two 32-bit float values using
-/// the low-order bits of both operands to determine if the first operand is
-/// less than the second operand and returns the result of the comparison.
-///
-/// If either of the two lower 32-bit values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower 32-bit values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomilt_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Performs an unordered comparison of two 32-bit float values using
-/// the low-order bits of both operands to determine if the first operand is
-/// less than or equal to the second operand and returns the result of the
-/// comparison.
-///
-/// If either of the two lower 32-bit values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower 32-bit values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomile_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Performs an unordered comparison of two 32-bit float values using
-/// the low-order bits of both operands to determine if the first operand is
-/// greater than the second operand and returns the result of the
-/// comparison.
-///
-/// If either of the two lower 32-bit values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower 32-bit values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomigt_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Performs an unordered comparison of two 32-bit float values using
-/// the low-order bits of both operands to determine if the first operand is
-/// greater than or equal to the second operand and returns the result of
-/// the comparison.
-///
-/// If either of the two lower 32-bit values is NaN, 0 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower 32-bit values is NaN, 0 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomige_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Performs an unordered comparison of two 32-bit float values using
-/// the low-order bits of both operands to determine inequality and returns
-/// the result of the comparison.
-///
-/// If either of the two lower 32-bit values is NaN, 1 is returned.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \param __b
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the comparison.
-/// \returns An integer containing the comparison results. If either of the two
-/// lower 32-bit values is NaN, 1 is returned.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_ucomineq_ss(__m128 __a, __m128 __b)
-{
- return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
-}
-
-/// Converts a float value contained in the lower 32 bits of a vector of
-/// [4 x float] into a 32-bit integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the conversion.
-/// \returns A 32-bit integer containing the converted value.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvtss_si32(__m128 __a)
-{
- return __builtin_ia32_cvtss2si((__v4sf)__a);
-}
-
-/// Converts a float value contained in the lower 32 bits of a vector of
-/// [4 x float] into a 32-bit integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the conversion.
-/// \returns A 32-bit integer containing the converted value.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvt_ss2si(__m128 __a)
-{
- return _mm_cvtss_si32(__a);
-}
-
-#ifdef __x86_64__
-
-/// Converts a float value contained in the lower 32 bits of a vector of
-/// [4 x float] into a 64-bit integer.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the conversion.
-/// \returns A 64-bit integer containing the converted value.
-static __inline__ long long __DEFAULT_FN_ATTRS
-_mm_cvtss_si64(__m128 __a)
-{
- return __builtin_ia32_cvtss2si64((__v4sf)__a);
-}
-
-#endif
-
-/// Converts two low-order float values in a 128-bit vector of
-/// [4 x float] into a 64-bit vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtps_pi32(__m128 __a)
-{
- return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
-}
-
-/// Converts two low-order float values in a 128-bit vector of
-/// [4 x float] into a 64-bit vector of [2 x i32].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvt_ps2pi(__m128 __a)
-{
- return _mm_cvtps_pi32(__a);
-}
-
-/// Converts a float value contained in the lower 32 bits of a vector of
-/// [4 x float] into a 32-bit integer, truncating the result when it is
-/// inexact.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the conversion.
-/// \returns A 32-bit integer containing the converted value.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvttss_si32(__m128 __a)
-{
- return __builtin_ia32_cvttss2si((__v4sf)__a);
-}
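-
-/* Illustrative sketch, not part of this header's API: _mm_cvttss_si32
- * truncates toward zero (2.7f -> 2, -2.7f -> -2), whereas _mm_cvtss_si32
- * rounds according to the current MXCSR rounding mode (round-to-nearest-even
- * by default).  The helper name __example_trunc_low_lane is hypothetical. */
-static __inline__ int __DEFAULT_FN_ATTRS
-__example_trunc_low_lane(__m128 __a)
-{
- return _mm_cvttss_si32(__a);
-}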
-
-/// Converts a float value contained in the lower 32 bits of a vector of
-/// [4 x float] into a 32-bit integer, truncating the result when it is
-/// inexact.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the conversion.
-/// \returns A 32-bit integer containing the converted value.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_cvtt_ss2si(__m128 __a)
-{
- return _mm_cvttss_si32(__a);
-}
-
-#ifdef __x86_64__
-/// Converts a float value contained in the lower 32 bits of a vector of
-/// [4 x float] into a 64-bit integer, truncating the result when it is
-/// inexact.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
-/// instructions.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the conversion.
-/// \returns A 64-bit integer containing the converted value.
-static __inline__ long long __DEFAULT_FN_ATTRS
-_mm_cvttss_si64(__m128 __a)
-{
- return __builtin_ia32_cvttss2si64((__v4sf)__a);
-}
-#endif
-
-/// Converts two low-order float values in a 128-bit vector of
-/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
-/// when it is inexact.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvttps_pi32(__m128 __a)
-{
- return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
-}
-
-/// Converts two low-order float values in a 128-bit vector of [4 x
-/// float] into a 64-bit vector of [2 x i32], truncating the result when it
-/// is inexact.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \returns A 64-bit integer vector containing the converted values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtt_ps2pi(__m128 __a)
-{
- return _mm_cvttps_pi32(__a);
-}
-
-/// Converts a 32-bit signed integer value into a floating point value
-/// and writes it to the lower 32 bits of the destination. The remaining
-/// higher order elements of the destination vector are copied from the
-/// corresponding elements in the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 32-bit signed integer operand containing the value to be converted.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
-/// converted value of the second operand. The upper 96 bits are copied from
-/// the upper 96 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cvtsi32_ss(__m128 __a, int __b)
-{
- __a[0] = __b;
- return __a;
-}
-
-/// Converts a 32-bit signed integer value into a floating point value
-/// and writes it to the lower 32 bits of the destination. The remaining
-/// higher order elements of the destination are copied from the
-/// corresponding elements in the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 32-bit signed integer operand containing the value to be converted.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
-/// converted value of the second operand. The upper 96 bits are copied from
-/// the upper 96 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cvt_si2ss(__m128 __a, int __b)
-{
- return _mm_cvtsi32_ss(__a, __b);
-}
-
-#ifdef __x86_64__
-
-/// Converts a 64-bit signed integer value into a floating point value
-/// and writes it to the lower 32 bits of the destination. The remaining
-/// higher order elements of the destination are copied from the
-/// corresponding elements in the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 64-bit signed integer operand containing the value to be converted.
-/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
-/// converted value of the second operand. The upper 96 bits are copied from
-/// the upper 96 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_cvtsi64_ss(__m128 __a, long long __b)
-{
- __a[0] = __b;
- return __a;
-}
-
-#endif
-
-/// Converts two elements of a 64-bit vector of [2 x i32] into two
-/// floating point values and writes them to the lower 64-bits of the
-/// destination. The remaining higher order elements of the destination are
-/// copied from the corresponding elements in the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
-/// and written to the corresponding low-order elements in the destination.
-/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
-/// converted value of the second operand. The upper 64 bits are copied from
-/// the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpi32_ps(__m128 __a, __m64 __b)
-{
- return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
-}
-
-/// Converts two elements of a 64-bit vector of [2 x i32] into two
-/// floating point values and writes them to the lower 64-bits of the
-/// destination. The remaining higher order elements of the destination are
-/// copied from the corresponding elements in the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float].
-/// \param __b
-/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
-/// and written to the corresponding low-order elements in the destination.
-/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
-/// converted value from the second operand. The upper 64 bits are copied
-/// from the upper 64 bits of the first operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvt_pi2ps(__m128 __a, __m64 __b)
-{
- return _mm_cvtpi32_ps(__a, __b);
-}
-
-/// Extracts a float value contained in the lower 32 bits of a vector of
-/// [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
-/// used in the extraction.
-/// \returns A 32-bit float containing the extracted value.
-static __inline__ float __DEFAULT_FN_ATTRS
-_mm_cvtss_f32(__m128 __a)
-{
- return __a[0];
-}
-
-/// Loads two packed float values from the address \a __p into the
-/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
-/// are copied from the low-order bits of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
-/// of the destination.
-/// \param __p
-/// A pointer to two packed float values. Bits [63:0] are written to bits
-/// [127:64] of the destination.
-/// \returns A 128-bit vector of [4 x float] containing the moved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_loadh_pi(__m128 __a, const __m64 *__p)
-{
- typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
- struct __mm_loadh_pi_struct {
- __mm_loadh_pi_v2f32 __u;
- } __attribute__((__packed__, __may_alias__));
- __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
- __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
- return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
-}
-
-/// Loads two packed float values from the address \a __p into the
-/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
-/// are copied from the high-order bits of the first operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
-/// [127:64] of the destination.
-/// \param __p
-/// A pointer to two packed float values. Bits [63:0] are written to bits
-/// [63:0] of the destination.
-/// \returns A 128-bit vector of [4 x float] containing the moved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_loadl_pi(__m128 __a, const __m64 *__p)
-{
- typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
- struct __mm_loadl_pi_struct {
- __mm_loadl_pi_v2f32 __u;
- } __attribute__((__packed__, __may_alias__));
- __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
- __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
- return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
-}
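-
-/* Illustrative sketch, not part of this header's API: chaining _mm_loadl_pi
- * and _mm_loadh_pi assembles a full [4 x float] from two separate 64-bit
- * memory locations; all four lanes of the placeholder argument are
- * overwritten.  The helper name __example_load_two_halves is hypothetical. */
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-__example_load_two_halves(__m128 __placeholder, const __m64 *__lo,
-                          const __m64 *__hi)
-{
- return _mm_loadh_pi(_mm_loadl_pi(__placeholder, __lo), __hi);
-}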
-
-/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
-/// 32 bits of the vector are initialized with the single-precision
-/// floating-point value loaded from a specified memory location. The upper
-/// 96 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
-///
-/// \param __p
-/// A pointer to a 32-bit memory location containing a single-precision
-/// floating-point value.
-/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
-/// lower 32 bits contain the value loaded from the memory location. The
-/// upper 96 bits are set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_load_ss(const float *__p)
-{
- struct __mm_load_ss_struct {
- float __u;
- } __attribute__((__packed__, __may_alias__));
- float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
- return __extension__ (__m128){ __u, 0, 0, 0 };
-}
-
-/// Loads a 32-bit float value and duplicates it to all four vector
-/// elements of a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
-/// instruction.
-///
-/// \param __p
-/// A pointer to a float value to be loaded and duplicated.
-/// \returns A 128-bit vector of [4 x float] containing the loaded and
-/// duplicated values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_load1_ps(const float *__p)
-{
- struct __mm_load1_ps_struct {
- float __u;
- } __attribute__((__packed__, __may_alias__));
- float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
- return __extension__ (__m128){ __u, __u, __u, __u };
-}
-
-#define _mm_load_ps1(p) _mm_load1_ps(p)
-
-/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
-/// memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a 128-bit memory location. The address of the memory
-/// location has to be 128-bit aligned.
-/// \returns A 128-bit vector of [4 x float] containing the loaded values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_load_ps(const float *__p)
-{
- return *(const __m128*)__p;
-}
-
-/// Loads a 128-bit floating-point vector of [4 x float] from an
-/// unaligned memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a 128-bit memory location. The address of the memory
-/// location does not have to be aligned.
-/// \returns A 128-bit vector of [4 x float] containing the loaded values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_loadu_ps(const float *__p)
-{
- struct __loadu_ps {
- __m128_u __v;
- } __attribute__((__packed__, __may_alias__));
- return ((const struct __loadu_ps*)__p)->__v;
-}
-
-/// Loads four packed float values, in reverse order, from an aligned
-/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
-/// instruction.
-///
-/// \param __p
-/// A pointer to a 128-bit memory location. The address of the memory
-/// location has to be 128-bit aligned.
-/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
-/// in reverse order.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_loadr_ps(const float *__p)
-{
- __m128 __a = _mm_load_ps(__p);
- return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
-}
-
-/// Create a 128-bit vector of [4 x float] with undefined values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic has no corresponding instruction.
-///
-/// \returns A 128-bit vector of [4 x float] containing undefined values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_undefined_ps(void)
-{
- return (__m128)__builtin_ia32_undef128();
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
-/// 32 bits of the vector are initialized with the specified single-precision
-/// floating-point value. The upper 96 bits are set to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
-///
-/// \param __w
-/// A single-precision floating-point value used to initialize the lower 32
-/// bits of the result.
-/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
-/// lower 32 bits contain the value provided in the source operand. The
-/// upper 96 bits are set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ss(float __w)
-{
- return __extension__ (__m128){ __w, 0, 0, 0 };
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float], with each
-/// of the four single-precision floating-point vector elements set to the
-/// specified single-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
-///
-/// \param __w
-/// A single-precision floating-point value used to initialize each vector
-/// element of the result.
-/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set1_ps(float __w)
-{
- return __extension__ (__m128){ __w, __w, __w, __w };
-}
-
-/* Microsoft specific. */
-/// Constructs a 128-bit floating-point vector of [4 x float], with each
-/// of the four single-precision floating-point vector elements set to the
-/// specified single-precision floating-point value.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
-///
-/// \param __w
-/// A single-precision floating-point value used to initialize each vector
-/// element of the result.
-/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ps1(float __w)
-{
- return _mm_set1_ps(__w);
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float]
-/// initialized with the specified single-precision floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __z
-/// A single-precision floating-point value used to initialize bits [127:96]
-/// of the result.
-/// \param __y
-/// A single-precision floating-point value used to initialize bits [95:64]
-/// of the result.
-/// \param __x
-/// A single-precision floating-point value used to initialize bits [63:32]
-/// of the result.
-/// \param __w
-/// A single-precision floating-point value used to initialize bits [31:0]
-/// of the result.
-/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_set_ps(float __z, float __y, float __x, float __w)
-{
- return __extension__ (__m128){ __w, __x, __y, __z };
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float],
-/// initialized in reverse order with the specified 32-bit single-precision
-/// floating-point values.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic is a utility function and does not correspond to a specific
-/// instruction.
-///
-/// \param __z
-/// A single-precision floating-point value used to initialize bits [31:0]
-/// of the result.
-/// \param __y
-/// A single-precision floating-point value used to initialize bits [63:32]
-/// of the result.
-/// \param __x
-/// A single-precision floating-point value used to initialize bits [95:64]
-/// of the result.
-/// \param __w
-/// A single-precision floating-point value used to initialize bits [127:96]
-/// of the result.
-/// \returns An initialized 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_setr_ps(float __z, float __y, float __x, float __w)
-{
- return __extension__ (__m128){ __z, __y, __x, __w };
-}
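-
-/* Illustrative sketch, not part of this header's API: _mm_setr_ps lists the
- * lanes in memory order, so the call below puts 0.0f in bits [31:0] and 3.0f
- * in bits [127:96]; _mm_set_ps(3.0f, 2.0f, 1.0f, 0.0f) builds the identical
- * vector with the arguments reversed.  The helper name __example_iota_ps is
- * hypothetical. */
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-__example_iota_ps(void)
-{
- return _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
-}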
-
-/// Constructs a 128-bit floating-point vector of [4 x float] initialized
-/// to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
-///
-/// \returns An initialized 128-bit floating-point vector of [4 x float] with
-/// all elements set to zero.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_setzero_ps(void)
-{
- return __extension__ (__m128){ 0, 0, 0, 0 };
-}
-
-/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
-/// memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a 64-bit memory location.
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storeh_pi(__m64 *__p, __m128 __a)
-{
- typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
- struct __mm_storeh_pi_struct {
- __mm_storeh_pi_v2f32 __u;
- } __attribute__((__packed__, __may_alias__));
- ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
-}
-
-/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
-/// memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a memory location that will receive the float values.
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storel_pi(__m64 *__p, __m128 __a)
-{
- typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
- struct __mm_storeh_pi_struct {
- __mm_storeh_pi_v2f32 __u;
- } __attribute__((__packed__, __may_alias__));
- ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
-}
-
-/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
-/// memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
-///
-/// \param __p
-/// A pointer to a 32-bit memory location.
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store_ss(float *__p, __m128 __a)
-{
- struct __mm_store_ss_struct {
- float __u;
- } __attribute__((__packed__, __may_alias__));
- ((struct __mm_store_ss_struct*)__p)->__u = __a[0];
-}
-
-/// Stores a 128-bit vector of [4 x float] to an unaligned memory
-/// location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a 128-bit memory location. The address of the memory
-/// location does not have to be aligned.
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storeu_ps(float *__p, __m128 __a)
-{
- struct __storeu_ps {
- __m128_u __v;
- } __attribute__((__packed__, __may_alias__));
- ((struct __storeu_ps*)__p)->__v = __a;
-}
-
-/// Stores a 128-bit vector of [4 x float] into an aligned memory
-/// location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a 128-bit memory location. The address of the memory
-/// location has to be 16-byte aligned.
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store_ps(float *__p, __m128 __a)
-{
- *(__m128*)__p = __a;
-}
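
/* Illustrative sketch (editor's addition; hypothetical name): contrasts the
 * aligned and unaligned store forms documented above. _mm_store_ps requires a
 * 16-byte-aligned address, while _mm_storeu_ps accepts any address. Assumes
 * SSE is enabled. */
#include <xmmintrin.h>

static inline void example_store_forms(float *unaligned_dst)
{
    float aligned_dst[4] __attribute__((aligned(16)));
    __m128 v = _mm_set1_ps(2.5f);

    _mm_store_ps(aligned_dst, v);    /* OK: 16-byte-aligned stack buffer */
    _mm_storeu_ps(unaligned_dst, v); /* OK for any alignment */
}
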
-
-/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
-/// four contiguous elements in an aligned memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
-/// instruction.
-///
-/// \param __p
-/// A pointer to a 128-bit memory location.
-/// \param __a
-/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
-/// of the four contiguous elements pointed to by \a __p.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store1_ps(float *__p, __m128 __a)
-{
- __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
- _mm_store_ps(__p, __a);
-}
-
-/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
-/// four contiguous elements in an aligned memory location.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
-/// instruction.
-///
-/// \param __p
-/// A pointer to a 128-bit memory location.
-/// \param __a
-/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
-/// of the four contiguous elements pointed to by \a __p.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_store_ps1(float *__p, __m128 __a)
-{
- _mm_store1_ps(__p, __a);
-}
-
-/// Stores float values from a 128-bit vector of [4 x float] to an
-/// aligned memory location in reverse order.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
-/// instruction.
-///
-/// \param __p
-/// A pointer to a 128-bit memory location. The address of the memory
-/// location has to be 128-bit aligned.
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_storer_ps(float *__p, __m128 __a)
-{
- __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
- _mm_store_ps(__p, __a);
-}
-
-#define _MM_HINT_ET0 7
-#define _MM_HINT_ET1 6
-#define _MM_HINT_T0 3
-#define _MM_HINT_T1 2
-#define _MM_HINT_T2 1
-#define _MM_HINT_NTA 0
-
-#ifndef _MSC_VER
-/* FIXME: We have to #define this because "sel" must be a constant integer, and
- Sema doesn't do any form of constant propagation yet. */
-
-/// Loads one cache line of data from the specified address to a location
-/// closer to the processor.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// void _mm_prefetch(const void * a, const int sel);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
-///
-/// \param a
-/// A pointer to a memory location containing a cache line of data.
-/// \param sel
-/// A predefined integer constant specifying the type of prefetch
-/// operation: \n
-/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
-/// PREFETCHNTA instruction will be generated. \n
-/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
-/// be generated. \n
-/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
-/// be generated. \n
-/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
-/// be generated.
-#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \
- ((sel) >> 2) & 1, (sel) & 0x3))
-#endif
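
/* Illustrative sketch (editor's addition; hypothetical name): prefetches data
 * a fixed distance ahead of the element currently being summed, using the T0
 * hint. The prefetch distance of 16 floats is an arbitrary illustration, not
 * a tuned value. */
#include <xmmintrin.h>

static inline float example_prefetch_sum(const float *data, int n)
{
    float sum = 0.0f;
    for (int i = 0; i < n; ++i) {
        if (i + 16 < n)
            _mm_prefetch((const char *)&data[i + 16], _MM_HINT_T0);
        sum += data[i];
    }
    return sum;
}
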
-
-/// Stores a 64-bit integer in the specified aligned memory location. To
-/// minimize caching, the data is flagged as non-temporal (unlikely to be
-/// used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
-///
-/// \param __p
-/// A pointer to an aligned memory location used to store the register value.
-/// \param __a
-/// A 64-bit integer containing the value to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
-_mm_stream_pi(__m64 *__p, __m64 __a)
-{
- __builtin_ia32_movntq(__p, __a);
-}
-
-/// Moves packed float values from a 128-bit vector of [4 x float] to a
-/// 128-bit aligned memory location. To minimize caching, the data is flagged
-/// as non-temporal (unlikely to be used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
-///
-/// \param __p
-/// A pointer to a 128-bit aligned memory location that will receive the
-/// single-precision floating-point values.
-/// \param __a
-/// A 128-bit vector of [4 x float] containing the values to be moved.
-static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_ps(float *__p, __m128 __a)
-{
- __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
-}
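
/* Illustrative sketch (editor's addition; hypothetical name): fills a large,
 * 16-byte-aligned buffer with non-temporal stores and then issues an SFENCE
 * so the streaming stores are ordered before any later stores. Assumes n is a
 * multiple of 4. */
#include <xmmintrin.h>

static inline void example_stream_fill(float *dst, int n, float value)
{
    __m128 v = _mm_set1_ps(value);
    for (int i = 0; i < n; i += 4)
        _mm_stream_ps(&dst[i], v); /* bypasses the cache hierarchy */
    _mm_sfence();                  /* make the streamed data globally visible */
}
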
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/// Forces strong memory ordering (serialization) between store
-/// instructions preceding this instruction and store instructions following
-/// this instruction, ensuring the system completes all previous stores
-/// before executing subsequent stores.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
-///
-void _mm_sfence(void);
-
-#if defined(__cplusplus)
-} // extern "C"
-#endif
-
-/// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and
-/// returns it, as specified by the immediate integer operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// int _mm_extract_pi16(__m64 a, int n);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
-///
-/// \param a
-/// A 64-bit vector of [4 x i16].
-/// \param n
-/// An immediate integer operand that determines which bits are extracted: \n
-/// 0: Bits [15:0] are copied to the destination. \n
-/// 1: Bits [31:16] are copied to the destination. \n
-/// 2: Bits [47:32] are copied to the destination. \n
-/// 3: Bits [63:48] are copied to the destination.
-/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
-#define _mm_extract_pi16(a, n) \
- ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n))
-
-/// Copies data from the 64-bit vector of [4 x i16] to the destination,
-/// and inserts the lower 16 bits of an integer operand at the 16-bit offset
-/// specified by the immediate operand \a n.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
-///
-/// \param a
-/// A 64-bit vector of [4 x i16].
-/// \param d
-/// An integer. The lower 16-bit value from this operand is written to the
-/// destination at the offset specified by operand \a n.
-/// \param n
-/// An immediate integer operand that determines which bits in the
-/// destination are used. \n
-/// 0: Bits [15:0] are copied to the destination. \n
-/// 1: Bits [31:16] are copied to the destination. \n
-/// 2: Bits [47:32] are copied to the destination. \n
-/// 3: Bits [63:48] are copied to the destination. \n
-/// The remaining bits in the destination are copied from the corresponding
-/// bits in operand \a a.
-/// \returns A 64-bit integer vector containing the copied packed data from the
-/// operands.
-#define _mm_insert_pi16(a, d, n) \
- ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n))
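
/* Illustrative sketch (editor's addition; hypothetical name): reads one
 * 16-bit lane with _mm_extract_pi16 and rewrites another with
 * _mm_insert_pi16. The lane index must be a compile-time constant. Assumes
 * MMX and SSE are enabled. */
#include <xmmintrin.h>

static inline int example_extract_insert(void)
{
    __m64 v = _mm_setr_pi16(10, 20, 30, 40);
    int second = _mm_extract_pi16(v, 1); /* reads lane 1: 20 */
    v = _mm_insert_pi16(v, 99, 3);       /* lanes are now 10, 20, 30, 99 */
    _mm_empty();                         /* leave MMX state before x87 code */
    return second;
}
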
-
-/// Compares each of the corresponding packed 16-bit integer values of
-/// the 64-bit integer vectors, and writes the greater value to the
-/// corresponding bits in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing one of the source operands.
-/// \param __b
-/// A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_max_pi16(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Compares each of the corresponding packed 8-bit unsigned integer
-/// values of the 64-bit integer vectors, and writes the greater value to the
-/// corresponding bits in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing one of the source operands.
-/// \param __b
-/// A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_max_pu8(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
-}
-
-/// Compares each of the corresponding packed 16-bit integer values of
-/// the 64-bit integer vectors, and writes the lesser value to the
-/// corresponding bits in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing one of the source operands.
-/// \param __b
-/// A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_min_pi16(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Compares each of the corresponding packed 8-bit unsigned integer
-/// values of the 64-bit integer vectors, and writes the lesser value to the
-/// corresponding bits in the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing one of the source operands.
-/// \param __b
-/// A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the comparison results.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_min_pu8(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
-}
-
-/// Takes the most significant bit from each 8-bit element in a 64-bit
-/// integer vector to create an 8-bit mask value. Zero-extends the value to
-/// a 32-bit integer and writes it to the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing the values with bits to be extracted.
-/// \returns The most significant bit from each 8-bit element in \a __a,
-/// written to bits [7:0].
-static __inline__ int __DEFAULT_FN_ATTRS_MMX
-_mm_movemask_pi8(__m64 __a)
-{
- return __builtin_ia32_pmovmskb((__v8qi)__a);
-}
-
-/// Multiplies packed 16-bit unsigned integer values and writes the
-/// high-order 16 bits of each 32-bit product to the corresponding bits in
-/// the destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing one of the source operands.
-/// \param __b
-/// A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the products of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_mulhi_pu16(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
-/// destination, as specified by the immediate value operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
-///
-/// \param a
-/// A 64-bit integer vector containing the values to be shuffled.
-/// \param n
-/// An immediate value containing an 8-bit value specifying which elements to
-/// copy from \a a. The destinations within the 64-bit destination are
-/// assigned values as follows: \n
-/// Bits [1:0] are used to assign values to bits [15:0] in the
-/// destination. \n
-/// Bits [3:2] are used to assign values to bits [31:16] in the
-/// destination. \n
-/// Bits [5:4] are used to assign values to bits [47:32] in the
-/// destination. \n
-/// Bits [7:6] are used to assign values to bits [63:48] in the
-/// destination. \n
-/// Bit value assignments: \n
-/// 00: assigned from bits [15:0] of \a a. \n
-/// 01: assigned from bits [31:16] of \a a. \n
-/// 10: assigned from bits [47:32] of \a a. \n
-/// 11: assigned from bits [63:48] of \a a.
-/// \returns A 64-bit integer vector containing the shuffled values.
-#define _mm_shuffle_pi16(a, n) \
- ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
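
/* Illustrative sketch (editor's addition; hypothetical name): reverses the
 * four 16-bit lanes of an MMX vector using the _MM_SHUFFLE(z, y, x, w) helper
 * defined near the end of this header. */
#include <xmmintrin.h>

static inline __m64 example_reverse_pi16(__m64 v)
{
    /* Lane 3 goes to lane 0, lane 2 to lane 1, and so on. */
    return _mm_shuffle_pi16(v, _MM_SHUFFLE(0, 1, 2, 3));
}
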
-
-/// Conditionally copies the values from each 8-bit element in the first
-/// 64-bit integer vector operand to the specified memory location, as
-/// specified by the most significant bit in the corresponding element in the
-/// second 64-bit integer vector operand.
-///
-/// To minimize caching, the data is flagged as non-temporal
-/// (unlikely to be used again soon).
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
-///
-/// \param __d
-/// A 64-bit integer vector containing the values with elements to be copied.
-/// \param __n
-/// A 64-bit integer vector operand. The most significant bit from each 8-bit
-/// element determines whether the corresponding element in operand \a __d
-/// is copied. If the most significant bit of a given element is 1, the
-/// corresponding element in operand \a __d is copied.
-/// \param __p
-/// A pointer to a 64-bit memory location that will receive the conditionally
-/// copied integer values. The address of the memory location does not have
-/// to be aligned.
-static __inline__ void __DEFAULT_FN_ATTRS_MMX
-_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
-{
- __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p);
-}
-
-/// Computes the rounded averages of the packed unsigned 8-bit integer
-/// values and writes the averages to the corresponding bits in the
-/// destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing one of the source operands.
-/// \param __b
-/// A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_avg_pu8(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b);
-}
-
-/// Computes the rounded averages of the packed unsigned 16-bit integer
-/// values and writes the averages to the corresponding bits in the
-/// destination.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing one of the source operands.
-/// \param __b
-/// A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector containing the averages of both operands.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_avg_pu16(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b);
-}
-
-/// Subtracts the corresponding 8-bit unsigned integer values of the two
-/// 64-bit vector operands and computes the absolute value of each
-/// difference. The sum of the 8 absolute differences is then written to
-/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
-///
-/// \param __a
-/// A 64-bit integer vector containing one of the source operands.
-/// \param __b
-/// A 64-bit integer vector containing one of the source operands.
-/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
-/// sets of absolute differences between both operands. The upper bits are
-/// cleared.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_sad_pu8(__m64 __a, __m64 __b)
-{
- return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b);
-}
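
/* Illustrative sketch (editor's addition; hypothetical name): uses the sum of
 * absolute differences as a similarity score between two 8-byte pixel rows;
 * smaller scores mean the rows are closer. Assumes MMX and SSE are enabled. */
#include <xmmintrin.h>

static inline int example_row_distance(__m64 row_a, __m64 row_b)
{
    __m64 sad = _mm_sad_pu8(row_a, row_b); /* sum lands in bits [15:0] */
    int score = _mm_cvtsi64_si32(sad);     /* upper bits are already zero */
    _mm_empty();
    return score;
}
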
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/// Returns the contents of the MXCSR register as a 32-bit unsigned
-/// integer value.
-///
-/// There are several groups of macros associated with this
-/// intrinsic, including:
-/// <ul>
-/// <li>
-/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
-/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
-/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
-/// _MM_GET_EXCEPTION_STATE().
-/// </li>
-/// <li>
-/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
-/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
-/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
-/// </li>
-/// <li>
-/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
-/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
-/// _MM_GET_ROUNDING_MODE().
-/// </li>
-/// <li>
-/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
-/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
-/// </li>
-/// <li>
-/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
-/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
-/// _MM_GET_DENORMALS_ZERO_MODE().
-/// </li>
-/// </ul>
-///
-/// For example, the following expression checks if an overflow exception has
-/// occurred:
-/// \code
-/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
-/// \endcode
-///
-/// The following expression gets the current rounding mode:
-/// \code
-/// _MM_GET_ROUNDING_MODE()
-/// \endcode
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
-///
-/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
-/// register.
-unsigned int _mm_getcsr(void);
-
-/// Sets the MXCSR register with the 32-bit unsigned integer value.
-///
-/// There are several groups of macros associated with this intrinsic,
-/// including:
-/// <ul>
-/// <li>
-/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
-/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
-/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
-/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
-/// </li>
-/// <li>
-/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
-/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
-/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
-/// of these macros.
-/// </li>
-/// <li>
-/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
-/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
-/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
-/// </li>
-/// <li>
-/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
-/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
-/// one of these macros.
-/// </li>
-/// <li>
-/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
-/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
-/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
-/// </li>
-/// </ul>
-///
-/// For example, the following expression causes subsequent floating-point
-/// operations to round up:
-/// \code
-/// _MM_SET_ROUNDING_MODE(_MM_ROUND_UP)
-/// \endcode
-///
-/// The following example sets the DAZ and FTZ flags:
-/// \code
-/// void setFlags() {
-/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
-/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
-/// }
-/// \endcode
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
-///
-/// \param __i
-/// A 32-bit unsigned integer value to be written to the MXCSR register.
-void _mm_setcsr(unsigned int __i);
-
-#if defined(__cplusplus)
-} // extern "C"
-#endif
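
/* Illustrative sketch (editor's addition; hypothetical name): temporarily
 * switches the SSE rounding mode to round-toward-zero with the convenience
 * macros described above, then restores the caller's MXCSR contents. */
#include <xmmintrin.h>

static inline __m128 example_truncating_add(__m128 a, __m128 b)
{
    unsigned int saved = _mm_getcsr();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
    __m128 r = _mm_add_ps(a, b); /* each sum is rounded toward zero */
    _mm_setcsr(saved);           /* restore rounding mode and exception masks */
    return r;
}
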
-
-/// Selects 4 float values from the 128-bit operands of [4 x float], as
-/// specified by the immediate value operand.
-///
-/// \headerfile <x86intrin.h>
-///
-/// \code
-/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
-/// \endcode
-///
-/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
-///
-/// \param a
-/// A 128-bit vector of [4 x float].
-/// \param b
-/// A 128-bit vector of [4 x float].
-/// \param mask
-/// An immediate value containing an 8-bit value specifying which elements to
-/// copy from \a a and \a b. \n
-/// Bits [3:0] specify the values copied from operand \a a. \n
-/// Bits [7:4] specify the values copied from operand \a b. \n
-/// The destinations within the 128-bit destination are assigned values as
-/// follows: \n
-/// Bits [1:0] are used to assign values to bits [31:0] in the
-/// destination. \n
-/// Bits [3:2] are used to assign values to bits [63:32] in the
-/// destination. \n
-/// Bits [5:4] are used to assign values to bits [95:64] in the
-/// destination. \n
-/// Bits [7:6] are used to assign values to bits [127:96] in the
-/// destination. \n
-/// Bit value assignments: \n
-/// 00: Bits [31:0] copied from the specified operand. \n
-/// 01: Bits [63:32] copied from the specified operand. \n
-/// 10: Bits [95:64] copied from the specified operand. \n
-/// 11: Bits [127:96] copied from the specified operand.
-/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
-#define _mm_shuffle_ps(a, b, mask) \
- ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
- (int)(mask)))
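
/* Illustrative sketch (editor's addition; hypothetical name): two common
 * _mm_shuffle_ps patterns expressed with the _MM_SHUFFLE helper defined near
 * the end of this header. */
#include <xmmintrin.h>

static inline __m128 example_shuffle_ps(__m128 a, __m128 b)
{
    __m128 bcast = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 2, 2, 2)); /* {a2,a2,a2,a2} */
    __m128 mixed = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0)); /* {a0,a1,b2,b3} */
    return _mm_add_ps(bcast, mixed);
}
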
-
-/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
-/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. \n
-/// Bits [95:64] are written to bits [31:0] of the destination. \n
-/// Bits [127:96] are written to bits [95:64] of the destination.
-/// \param __b
-/// A 128-bit vector of [4 x float].
-/// Bits [95:64] are written to bits [63:32] of the destination. \n
-/// Bits [127:96] are written to bits [127:96] of the destination.
-/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_unpackhi_ps(__m128 __a, __m128 __b)
-{
- return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7);
-}
-
-/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
-/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
-///
-/// \param __a
-/// A 128-bit vector of [4 x float]. \n
-/// Bits [31:0] are written to bits [31:0] of the destination. \n
-/// Bits [63:32] are written to bits [95:64] of the destination.
-/// \param __b
-/// A 128-bit vector of [4 x float]. \n
-/// Bits [31:0] are written to bits [63:32] of the destination. \n
-/// Bits [63:32] are written to bits [127:96] of the destination.
-/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_unpacklo_ps(__m128 __a, __m128 __b)
-{
- return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5);
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
-/// 32 bits are set to the lower 32 bits of the second parameter. The upper
-/// 96 bits are set to the upper 96 bits of the first parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
-/// instruction.
-///
-/// \param __a
-/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
-/// written to the upper 96 bits of the result.
-/// \param __b
-/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
-/// written to the lower 32 bits of the result.
-/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_move_ss(__m128 __a, __m128 __b)
-{
- __a[0] = __b[0];
- return __a;
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
-/// 64 bits are set to the upper 64 bits of the second parameter. The upper
-/// 64 bits are set to the upper 64 bits of the first parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
-/// written to the upper 64 bits of the result.
-/// \param __b
-/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
-/// written to the lower 64 bits of the result.
-/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_movehl_ps(__m128 __a, __m128 __b)
-{
- return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3);
-}
-
-/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
-/// 64 bits are set to the lower 64 bits of the first parameter. The upper
-/// 64 bits are set to the lower 64 bits of the second parameter.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
-///
-/// \param __a
-/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
-/// written to the lower 64 bits of the result.
-/// \param __b
-/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
-/// written to the upper 64 bits of the result.
-/// \returns A 128-bit floating-point vector of [4 x float].
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_movelh_ps(__m128 __a, __m128 __b)
-{
- return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5);
-}
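
/* Illustrative sketch (editor's addition; hypothetical name): a horizontal
 * sum of the four floats in a vector, built only from intrinsics documented
 * in this header. */
#include <xmmintrin.h>

static inline float example_horizontal_sum(__m128 v)
{
    __m128 hi    = _mm_movehl_ps(v, v); /* {v2, v3, v2, v3} */
    __m128 pairs = _mm_add_ps(v, hi);   /* lane 0 = v0+v2, lane 1 = v1+v3 */
    __m128 other = _mm_shuffle_ps(pairs, pairs, _MM_SHUFFLE(1, 1, 1, 1));
    __m128 total = _mm_add_ss(pairs, other); /* lane 0 holds v0+v1+v2+v3 */
    return _mm_cvtss_f32(total);
}
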
-
-/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
-/// float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
-///
-/// \param __a
-/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
-/// from the corresponding elements in this operand.
-/// \returns A 128-bit vector of [4 x float] containing the copied and converted
-/// values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpi16_ps(__m64 __a)
-{
- __m64 __b, __c;
- __m128 __r;
-
- __b = _mm_setzero_si64();
- __b = _mm_cmpgt_pi16(__b, __a);
- __c = _mm_unpackhi_pi16(__a, __b);
- __r = _mm_setzero_ps();
- __r = _mm_cvtpi32_ps(__r, __c);
- __r = _mm_movelh_ps(__r, __r);
- __c = _mm_unpacklo_pi16(__a, __b);
- __r = _mm_cvtpi32_ps(__r, __c);
-
- return __r;
-}
-
-/// Converts a 64-bit vector of 16-bit unsigned integer values into a
-/// 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
-///
-/// \param __a
-/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
-/// destination are copied from the corresponding elements in this operand.
-/// \returns A 128-bit vector of [4 x float] containing the copied and converted
-/// values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpu16_ps(__m64 __a)
-{
- __m64 __b, __c;
- __m128 __r;
-
- __b = _mm_setzero_si64();
- __c = _mm_unpackhi_pi16(__a, __b);
- __r = _mm_setzero_ps();
- __r = _mm_cvtpi32_ps(__r, __c);
- __r = _mm_movelh_ps(__r, __r);
- __c = _mm_unpacklo_pi16(__a, __b);
- __r = _mm_cvtpi32_ps(__r, __c);
-
- return __r;
-}
-
-/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
-/// into a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
-///
-/// \param __a
-/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
-/// from the corresponding lower 4 elements in this operand.
-/// \returns A 128-bit vector of [4 x float] containing the copied and converted
-/// values from the operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpi8_ps(__m64 __a)
-{
- __m64 __b;
-
- __b = _mm_setzero_si64();
- __b = _mm_cmpgt_pi8(__b, __a);
- __b = _mm_unpacklo_pi8(__a, __b);
-
- return _mm_cvtpi16_ps(__b);
-}
-
-/// Converts the lower four unsigned 8-bit integer values from a 64-bit
-/// vector of [8 x u8] into a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
-///
-/// \param __a
-/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
-/// destination are copied from the corresponding lower 4 elements in this
-/// operand.
-/// \returns A 128-bit vector of [4 x float] containing the copied and converted
-/// values from the source operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpu8_ps(__m64 __a)
-{
- __m64 __b;
-
- __b = _mm_setzero_si64();
- __b = _mm_unpacklo_pi8(__a, __b);
-
- return _mm_cvtpi16_ps(__b);
-}
-
-/// Converts the two 32-bit signed integer values from each 64-bit vector
-/// operand of [2 x i32] into a 128-bit vector of [4 x float].
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
-///
-/// \param __a
-/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
-/// copied from the elements in this operand.
-/// \param __b
-/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
-/// copied from the elements in this operand.
-/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
-/// copied and converted values from the first operand. The upper 64 bits
-/// contain the copied and converted values from the second operand.
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
-{
- __m128 __c;
-
- __c = _mm_setzero_ps();
- __c = _mm_cvtpi32_ps(__c, __b);
- __c = _mm_movelh_ps(__c, __c);
-
- return _mm_cvtpi32_ps(__c, __a);
-}
-
-/// Converts each single-precision floating-point element of a 128-bit
-/// floating-point vector of [4 x float] into a 16-bit signed integer, and
-/// packs the results into a 64-bit integer vector of [4 x i16].
-///
-/// If the floating-point element is NaN or infinity, or if the
-/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
-/// it is converted to 0x8000. Otherwise if the floating-point element is
-/// greater than 0x7FFF, it is converted to 0x7FFF.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
-///
-/// \param __a
-/// A 128-bit floating-point vector of [4 x float].
-/// \returns A 64-bit integer vector of [4 x i16] containing the converted
-/// values.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtps_pi16(__m128 __a)
-{
- __m64 __b, __c;
-
- __b = _mm_cvtps_pi32(__a);
- __a = _mm_movehl_ps(__a, __a);
- __c = _mm_cvtps_pi32(__a);
-
- return _mm_packs_pi32(__b, __c);
-}
-
-/// Converts each single-precision floating-point element of a 128-bit
-/// floating-point vector of [4 x float] into an 8-bit signed integer, and
-/// packs the results into the lower 32 bits of a 64-bit integer vector of
-/// [8 x i8]. The upper 32 bits of the vector are set to 0.
-///
-/// If the floating-point element is NaN or infinity, or if the
-/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
-/// is converted to 0x80. Otherwise if the floating-point element is greater
-/// than 0x7F, it is converted to 0x7F.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
-///
-/// \param __a
-/// A 128-bit floating-point vector of [4 x float].
-/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
-/// converted values and the upper 32 bits are set to zero.
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
-_mm_cvtps_pi8(__m128 __a)
-{
- __m64 __b, __c;
-
- __b = _mm_cvtps_pi16(__a);
- __c = _mm_setzero_si64();
-
- return _mm_packs_pi16(__b, __c);
-}
-
-/// Extracts the sign bits from each single-precision floating-point
-/// element of a 128-bit floating-point vector of [4 x float] and returns the
-/// sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
-/// to zero.
-///
-/// \headerfile <x86intrin.h>
-///
-/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
-///
-/// \param __a
-/// A 128-bit floating-point vector of [4 x float].
-/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
-/// single-precision floating-point element of the parameter. Bits [31:4] are
-/// set to zero.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_movemask_ps(__m128 __a)
-{
- return __builtin_ia32_movmskps((__v4sf)__a);
-}
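
/* Illustrative sketch (editor's addition; hypothetical name): combines a
 * packed comparison with _mm_movemask_ps to test whether any element of a
 * vector is negative, without spilling the vector to memory. */
#include <xmmintrin.h>

static inline int example_any_negative(__m128 v)
{
    __m128 lt = _mm_cmplt_ps(v, _mm_setzero_ps()); /* all-ones lanes where v < 0 */
    return _mm_movemask_ps(lt) != 0;               /* nonzero if any sign bit is set */
}
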
-
-
-#define _MM_ALIGN16 __attribute__((aligned(16)))
-
-#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
-
-#define _MM_EXCEPT_INVALID (0x0001U)
-#define _MM_EXCEPT_DENORM (0x0002U)
-#define _MM_EXCEPT_DIV_ZERO (0x0004U)
-#define _MM_EXCEPT_OVERFLOW (0x0008U)
-#define _MM_EXCEPT_UNDERFLOW (0x0010U)
-#define _MM_EXCEPT_INEXACT (0x0020U)
-#define _MM_EXCEPT_MASK (0x003fU)
-
-#define _MM_MASK_INVALID (0x0080U)
-#define _MM_MASK_DENORM (0x0100U)
-#define _MM_MASK_DIV_ZERO (0x0200U)
-#define _MM_MASK_OVERFLOW (0x0400U)
-#define _MM_MASK_UNDERFLOW (0x0800U)
-#define _MM_MASK_INEXACT (0x1000U)
-#define _MM_MASK_MASK (0x1f80U)
-
-#define _MM_ROUND_NEAREST (0x0000U)
-#define _MM_ROUND_DOWN (0x2000U)
-#define _MM_ROUND_UP (0x4000U)
-#define _MM_ROUND_TOWARD_ZERO (0x6000U)
-#define _MM_ROUND_MASK (0x6000U)
-
-#define _MM_FLUSH_ZERO_MASK (0x8000U)
-#define _MM_FLUSH_ZERO_ON (0x8000U)
-#define _MM_FLUSH_ZERO_OFF (0x0000U)
-
-#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
-#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
-#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
-#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
-
-#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
-#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
-#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
-#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
-
-#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
-do { \
- __m128 tmp3, tmp2, tmp1, tmp0; \
- tmp0 = _mm_unpacklo_ps((row0), (row1)); \
- tmp2 = _mm_unpacklo_ps((row2), (row3)); \
- tmp1 = _mm_unpackhi_ps((row0), (row1)); \
- tmp3 = _mm_unpackhi_ps((row2), (row3)); \
- (row0) = _mm_movelh_ps(tmp0, tmp2); \
- (row1) = _mm_movehl_ps(tmp2, tmp0); \
- (row2) = _mm_movelh_ps(tmp1, tmp3); \
- (row3) = _mm_movehl_ps(tmp3, tmp1); \
-} while (0)
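
/* Illustrative sketch (editor's addition; hypothetical name): transposes a
 * row-major 4x4 matrix in place using the _MM_TRANSPOSE4_PS macro defined
 * above. Assumes the matrix storage is 16-byte aligned. */
#include <xmmintrin.h>

static inline void example_transpose4(float m[16])
{
    __m128 r0 = _mm_load_ps(&m[0]);
    __m128 r1 = _mm_load_ps(&m[4]);
    __m128 r2 = _mm_load_ps(&m[8]);
    __m128 r3 = _mm_load_ps(&m[12]);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3); /* rows become columns */
    _mm_store_ps(&m[0], r0);
    _mm_store_ps(&m[4], r1);
    _mm_store_ps(&m[8], r2);
    _mm_store_ps(&m[12], r3);
}
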
-
-/* Aliases for compatibility. */
-#define _m_pextrw _mm_extract_pi16
-#define _m_pinsrw _mm_insert_pi16
-#define _m_pmaxsw _mm_max_pi16
-#define _m_pmaxub _mm_max_pu8
-#define _m_pminsw _mm_min_pi16
-#define _m_pminub _mm_min_pu8
-#define _m_pmovmskb _mm_movemask_pi8
-#define _m_pmulhuw _mm_mulhi_pu16
-#define _m_pshufw _mm_shuffle_pi16
-#define _m_maskmovq _mm_maskmove_si64
-#define _m_pavgb _mm_avg_pu8
-#define _m_pavgw _mm_avg_pu16
-#define _m_psadbw _mm_sad_pu8
-#define _m_ _mm_
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_MMX
-
-/* Ugly hack for backwards-compatibility (compatible with gcc) */
-#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
-#include <emmintrin.h>
-#endif
-
-#endif /* __XMMINTRIN_H */
+++ /dev/null
-/*===---- xopintrin.h - XOP intrinsics -------------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __X86INTRIN_H
-#error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
-#endif
-
-#ifndef __XOPINTRIN_H
-#define __XOPINTRIN_H
-
-#include <fma4intrin.h>
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(128)))
-#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(256)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddw_epi8(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddd_epi8(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddq_epi8(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddd_epi16(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddq_epi16(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddq_epi32(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphadddq((__v4si)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddw_epu8(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddd_epu8(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddq_epu8(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddd_epu16(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddq_epu16(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_haddq_epu32(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsubw_epi8(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsubd_epi16(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_hsubq_epi32(__m128i __A)
-{
- return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C));
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
-{
- return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C));
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
-{
- return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_rot_epi8(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_rot_epi16(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_rot_epi32(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_rot_epi64(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B);
-}
-
-#define _mm_roti_epi8(A, N) \
- ((__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N)))
-
-#define _mm_roti_epi16(A, N) \
- ((__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N)))
-
-#define _mm_roti_epi32(A, N) \
- ((__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N)))
-
-#define _mm_roti_epi64(A, N) \
- ((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N)))
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_shl_epi8(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_shl_epi16(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_shl_epi32(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_shl_epi64(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha_epi8(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha_epi16(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha_epi32(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_sha_epi64(__m128i __A, __m128i __B)
-{
- return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B);
-}
-
-#define _mm_com_epu8(A, B, N) \
- ((__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (N)))
-
-#define _mm_com_epu16(A, B, N) \
- ((__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \
- (__v8hi)(__m128i)(B), (N)))
-
-#define _mm_com_epu32(A, B, N) \
- ((__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), (N)))
-
-#define _mm_com_epu64(A, B, N) \
- ((__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), (N)))
-
-#define _mm_com_epi8(A, B, N) \
- ((__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \
- (__v16qi)(__m128i)(B), (N)))
-
-#define _mm_com_epi16(A, B, N) \
- ((__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \
- (__v8hi)(__m128i)(B), (N)))
-
-#define _mm_com_epi32(A, B, N) \
- ((__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \
- (__v4si)(__m128i)(B), (N)))
-
-#define _mm_com_epi64(A, B, N) \
- ((__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \
- (__v2di)(__m128i)(B), (N)))
-
-#define _MM_PCOMCTRL_LT 0
-#define _MM_PCOMCTRL_LE 1
-#define _MM_PCOMCTRL_GT 2
-#define _MM_PCOMCTRL_GE 3
-#define _MM_PCOMCTRL_EQ 4
-#define _MM_PCOMCTRL_NEQ 5
-#define _MM_PCOMCTRL_FALSE 6
-#define _MM_PCOMCTRL_TRUE 7
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epu8(__m128i __A, __m128i __B)
-{
- return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epu8(__m128i __A, __m128i __B)
-{
- return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epu8(__m128i __A, __m128i __B)
-{
- return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epu8(__m128i __A, __m128i __B)
-{
- return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epu8(__m128i __A, __m128i __B)
-{
- return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epu8(__m128i __A, __m128i __B)
-{
- return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epu8(__m128i __A, __m128i __B)
-{
- return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epu8(__m128i __A, __m128i __B)
-{
- return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epu16(__m128i __A, __m128i __B)
-{
- return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epu16(__m128i __A, __m128i __B)
-{
- return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epu16(__m128i __A, __m128i __B)
-{
- return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epu16(__m128i __A, __m128i __B)
-{
- return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epu16(__m128i __A, __m128i __B)
-{
- return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epu16(__m128i __A, __m128i __B)
-{
- return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epu16(__m128i __A, __m128i __B)
-{
- return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epu16(__m128i __A, __m128i __B)
-{
- return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epu32(__m128i __A, __m128i __B)
-{
- return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epu32(__m128i __A, __m128i __B)
-{
- return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epu32(__m128i __A, __m128i __B)
-{
- return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epu32(__m128i __A, __m128i __B)
-{
- return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epu32(__m128i __A, __m128i __B)
-{
- return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epu32(__m128i __A, __m128i __B)
-{
- return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epu32(__m128i __A, __m128i __B)
-{
- return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epu32(__m128i __A, __m128i __B)
-{
- return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epu64(__m128i __A, __m128i __B)
-{
- return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epu64(__m128i __A, __m128i __B)
-{
- return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epu64(__m128i __A, __m128i __B)
-{
- return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epu64(__m128i __A, __m128i __B)
-{
- return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epu64(__m128i __A, __m128i __B)
-{
- return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epu64(__m128i __A, __m128i __B)
-{
- return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epu64(__m128i __A, __m128i __B)
-{
- return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epu64(__m128i __A, __m128i __B)
-{
- return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epi8(__m128i __A, __m128i __B)
-{
- return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epi8(__m128i __A, __m128i __B)
-{
- return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epi8(__m128i __A, __m128i __B)
-{
- return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epi8(__m128i __A, __m128i __B)
-{
- return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epi8(__m128i __A, __m128i __B)
-{
- return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epi8(__m128i __A, __m128i __B)
-{
- return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epi8(__m128i __A, __m128i __B)
-{
- return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epi8(__m128i __A, __m128i __B)
-{
- return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epi16(__m128i __A, __m128i __B)
-{
- return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epi16(__m128i __A, __m128i __B)
-{
- return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epi16(__m128i __A, __m128i __B)
-{
- return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epi16(__m128i __A, __m128i __B)
-{
- return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epi16(__m128i __A, __m128i __B)
-{
- return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epi16(__m128i __A, __m128i __B)
-{
- return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epi16(__m128i __A, __m128i __B)
-{
- return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epi16(__m128i __A, __m128i __B)
-{
- return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epi32(__m128i __A, __m128i __B)
-{
- return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epi32(__m128i __A, __m128i __B)
-{
- return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epi32(__m128i __A, __m128i __B)
-{
- return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epi32(__m128i __A, __m128i __B)
-{
- return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epi32(__m128i __A, __m128i __B)
-{
- return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epi32(__m128i __A, __m128i __B)
-{
- return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epi32(__m128i __A, __m128i __B)
-{
- return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epi32(__m128i __A, __m128i __B)
-{
- return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comlt_epi64(__m128i __A, __m128i __B)
-{
- return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comle_epi64(__m128i __A, __m128i __B)
-{
- return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comgt_epi64(__m128i __A, __m128i __B)
-{
- return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comge_epi64(__m128i __A, __m128i __B)
-{
- return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comeq_epi64(__m128i __A, __m128i __B)
-{
- return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comneq_epi64(__m128i __A, __m128i __B)
-{
- return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comfalse_epi64(__m128i __A, __m128i __B)
-{
- return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_comtrue_epi64(__m128i __A, __m128i __B)
-{
- return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE);
-}
-
-#define _mm_permute2_pd(X, Y, C, I) \
- ((__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \
- (__v2df)(__m128d)(Y), \
- (__v2di)(__m128i)(C), (I)))
-
-#define _mm256_permute2_pd(X, Y, C, I) \
- ((__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \
- (__v4df)(__m256d)(Y), \
- (__v4di)(__m256i)(C), (I)))
-
-#define _mm_permute2_ps(X, Y, C, I) \
- ((__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
- (__v4si)(__m128i)(C), (I)))
-
-#define _mm256_permute2_ps(X, Y, C, I) \
- ((__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \
- (__v8sf)(__m256)(Y), \
- (__v8si)(__m256i)(C), (I)))
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_frcz_ss(__m128 __A)
-{
- return (__m128)__builtin_ia32_vfrczss((__v4sf)__A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_frcz_sd(__m128d __A)
-{
- return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A);
-}
-
-static __inline__ __m128 __DEFAULT_FN_ATTRS
-_mm_frcz_ps(__m128 __A)
-{
- return (__m128)__builtin_ia32_vfrczps((__v4sf)__A);
-}
-
-static __inline__ __m128d __DEFAULT_FN_ATTRS
-_mm_frcz_pd(__m128d __A)
-{
- return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A);
-}
-
-static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_frcz_ps(__m256 __A)
-{
- return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A);
-}
-
-static __inline__ __m256d __DEFAULT_FN_ATTRS256
-_mm256_frcz_pd(__m256d __A)
-{
- return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A);
-}
-
-#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS256
-
-#endif /* __XOPINTRIN_H */
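
For orientation, a minimal usage sketch of the XOP element-wise comparison intrinsics removed above; the same functions are provided by the include-llvm copy of xopintrin.h, so callers are unaffected. The wrapper name and the -mxop build flag are assumptions for illustration, not part of this patch.

/* Sketch: XOP element-wise compare. Each 32-bit lane of the result is
 * all-ones where the corresponding lane of a is less than the lane of b,
 * and all-zeros otherwise. Assumes an XOP-capable toolchain (e.g. -mxop);
 * the wrapper name is illustrative only. */
#include <x86intrin.h>

__m128i lanes_less_than(__m128i a, __m128i b)
{
    return _mm_comlt_epi32(a, b);
}
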
+++ /dev/null
-/*===---- xsavecintrin.h - XSAVEC intrinsic --------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <xsavecintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __XSAVECINTRIN_H
-#define __XSAVECINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsavec")))
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsavec(void *__p, unsigned long long __m) {
- __builtin_ia32_xsavec(__p, __m);
-}
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsavec64(void *__p, unsigned long long __m) {
- __builtin_ia32_xsavec64(__p, __m);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
+++ /dev/null
-/*===---- xsaveintrin.h - XSAVE intrinsic ----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <xsaveintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __XSAVEINTRIN_H
-#define __XSAVEINTRIN_H
-
-#ifdef _MSC_VER
-#define _XCR_XFEATURE_ENABLED_MASK 0
-#endif
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsave")))
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsave(void *__p, unsigned long long __m) {
- __builtin_ia32_xsave(__p, __m);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xrstor(void *__p, unsigned long long __m) {
- __builtin_ia32_xrstor(__p, __m);
-}
-
-#ifndef _MSC_VER
-#define _xgetbv(A) __builtin_ia32_xgetbv((long long)(A))
-#define _xsetbv(A, B) __builtin_ia32_xsetbv((unsigned int)(A), (unsigned long long)(B))
-#else
-#ifdef __cplusplus
-extern "C" {
-#endif
-unsigned __int64 __cdecl _xgetbv(unsigned int);
-void __cdecl _xsetbv(unsigned int, unsigned __int64);
-#ifdef __cplusplus
-}
-#endif
-#endif /* _MSC_VER */
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsave64(void *__p, unsigned long long __m) {
- __builtin_ia32_xsave64(__p, __m);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xrstor64(void *__p, unsigned long long __m) {
- __builtin_ia32_xrstor64(__p, __m);
-}
-
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
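
As a reference for the XSAVE-family helpers deleted above (and re-introduced under include-llvm), a short hedged sketch of reading XCR0 with _xgetbv and saving extended state with _xsave. The 4 KiB buffer size is an assumption for illustration; the authoritative save-area size comes from CPUID leaf 0Dh.

/* Sketch: query the enabled-feature mask in XCR0 and save the extended
 * processor state. Assumptions: built with -mxsave, XCR0 has been set up,
 * and 4 KiB is large enough for the enabled components (the real size
 * must be taken from CPUID leaf 0Dh). */
#include <immintrin.h>

static _Alignas(64) unsigned char xsave_area[4096]; /* area must be 64-byte aligned */

void save_extended_state(void)
{
    unsigned long long xcr0 = _xgetbv(0); /* argument 0 selects XCR0 */
    _xsave(xsave_area, xcr0);             /* save the components enabled in XCR0 */
}
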
+++ /dev/null
-/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ----------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <xsaveoptintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __XSAVEOPTINTRIN_H
-#define __XSAVEOPTINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsaveopt")))
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsaveopt(void *__p, unsigned long long __m) {
- __builtin_ia32_xsaveopt(__p, __m);
-}
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsaveopt64(void *__p, unsigned long long __m) {
- __builtin_ia32_xsaveopt64(__p, __m);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
+++ /dev/null
-/*===---- xsavesintrin.h - XSAVES intrinsic --------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <xsavesintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __XSAVESINTRIN_H
-#define __XSAVESINTRIN_H
-
-/* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsaves")))
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsaves(void *__p, unsigned long long __m) {
- __builtin_ia32_xsaves(__p, __m);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xrstors(void *__p, unsigned long long __m) {
- __builtin_ia32_xrstors(__p, __m);
-}
-
-#ifdef __x86_64__
-static __inline__ void __DEFAULT_FN_ATTRS
-_xrstors64(void *__p, unsigned long long __m) {
- __builtin_ia32_xrstors64(__p, __m);
-}
-
-static __inline__ void __DEFAULT_FN_ATTRS
-_xsaves64(void *__p, unsigned long long __m) {
- __builtin_ia32_xsaves64(__p, __m);
-}
-#endif
-
-#undef __DEFAULT_FN_ATTRS
-
-#endif
+++ /dev/null
-/*===---- xtestintrin.h - XTEST intrinsic ----------------------------------===
- *
- * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
- * See https://llvm.org/LICENSE.txt for license information.
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
- *
- *===-----------------------------------------------------------------------===
- */
-
-#ifndef __IMMINTRIN_H
-#error "Never use <xtestintrin.h> directly; include <immintrin.h> instead."
-#endif
-
-#ifndef __XTESTINTRIN_H
-#define __XTESTINTRIN_H
-
-/* xtest returns non-zero if the instruction is executed within an RTM or active
- * HLE region. */
-/* FIXME: This can be an either or for RTM/HLE. Deal with this when HLE is
- * supported. */
-static __inline__ int
- __attribute__((__always_inline__, __nodebug__, __target__("rtm")))
- _xtest(void) {
- return __builtin_ia32_xtest();
-}
-
-#endif
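
Since the comment above describes _xtest only in terms of RTM/HLE regions, here is a small hedged sketch of how it is typically paired with _xbegin/_xend from rtmintrin.h. The shared counter and the fallback path are illustrative assumptions; only the _xbegin/_xend/_xtest/_XBEGIN_STARTED names come from the headers themselves.

/* Sketch: _xtest() returns non-zero only on the transactional path of an
 * RTM region. Assumes -mrtm; _xbegin, _xend and _XBEGIN_STARTED come from
 * <immintrin.h> (rtmintrin.h). The counter is illustrative only. */
#include <immintrin.h>

static int counter;

void increment_with_rtm(void)
{
    if (_xbegin() == _XBEGIN_STARTED) {
        if (_xtest())   /* non-zero: executing inside the transaction */
            ++counter;
        _xend();
    } else {
        ++counter;      /* transaction aborted or unsupported: plain fallback */
    }
}
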